Created
July 1, 2019 09:27
-
-
Save groverpr/76707faa6047296e647acc3f0c8978b5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Comparison of the encodings below using a random forest, followed by a final comparison with CatBoost\n", | |
"* Label Encoding\n", | |
"* One Hot Encoding\n", | |
"* Hash Encoding\n", | |
"* K-fold Target Encoding\n", | |
"* Ordered Target Encoding" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 523, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%reload_ext autoreload\n", | |
"%autoreload 2\n", | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 347, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import time\n", | |
"import random\n", | |
"import matplotlib.pyplot as plt\n", | |
"import string\n", | |
"from utils import *\n", | |
"from sklearn.datasets import *\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from sklearn.model_selection import StratifiedKFold, train_test_split\n", | |
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", | |
"from sklearn.metrics import roc_auc_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 257, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import catboost\n", | |
"from catboost import CatBoostClassifier" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Dataset 1. \n", | |
"Simulated dataset with a single categorical feature: 10,000 training points (plus 1,000 test points) drawn from 100 distinct levels. Rule: strings starting with a vowel have target label 1; all others have target label 0." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Creating simulated dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 473, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['KcBEK',\n", | |
" 'anDFr',\n", | |
" 'PZkcH',\n", | |
" 'FuepV',\n", | |
" 'xcAiM',\n", | |
" 'wyAsR',\n", | |
" 'qDlRt',\n", | |
" 'QxiDX',\n", | |
" 'pCNyc',\n", | |
" 'Lapim']" | |
] | |
}, | |
"execution_count": 473, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"random.seed(10)\n", | |
"catvar = []\n", | |
"for i in range(100):\n", | |
" random_str = ''.join([random.choice(string.ascii_letters) for n in range(5)])\n", | |
" catvar.append(random_str)\n", | |
"catvar[:10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 474, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# training and testing data\n", | |
"np.random.seed(10)\n", | |
"train_data = np.random.choice(catvar, size=10000)\n", | |
"np.random.seed(11)\n", | |
"test_data = np.random.choice(catvar, size=1000)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 475, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# strings starting with vowels are positive\n", | |
"# train\n", | |
"label_train_list = [c.lower().startswith((\"a\",\"e\",\"i\",\"o\",\"u\")) for c in train_data]\n", | |
"label_train_list = np.array(label_train_list).astype(int)\n", | |
"# test\n", | |
"label_test_list = [c.lower().startswith((\"a\",\"e\",\"i\",\"o\",\"u\")) for c in test_data]\n", | |
"label_test_list = np.array(label_test_list).astype(int)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 476, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>x</th>\n", | |
" <th>y</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Lapim</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>QeIfj</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>MUtbL</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>HEMFe</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>tczMW</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" x y\n", | |
"0 Lapim 0\n", | |
"1 QeIfj 0\n", | |
"2 MUtbL 0\n", | |
"3 HEMFe 0\n", | |
"4 tczMW 0" | |
] | |
}, | |
"execution_count": 476, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_data_df = pd.DataFrame({\"x\": train_data, \"y\": label_train_list})\n", | |
"test_data_df = pd.DataFrame({\"x\": test_data, \"y\": label_test_list})\n", | |
"train_data_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 477, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(0.208, 0.211)" | |
] | |
}, | |
"execution_count": 477, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sum(label_train_list)/len(label_train_list), sum(label_test_list)/len(label_test_list)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Both the train and test sets have roughly 21% positive labels (0.208 vs. 0.211), so their class balance is nearly identical." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Model comparisons" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 541, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"encoders = {\"one_hot\": one_hot_encoder, \n", | |
" \"numeric_label\": label_encoder,\n", | |
" \"hash\": hash_encoder, \n", | |
" \"kfold_target\": kfold_target_encoder, \n", | |
" \"catboost_target_encoder\": catboost_target_encoder}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 542, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"train_scores = {}\n", | |
"test_scores = {}\n", | |
"times = {}\n", | |
"for enc_name, enc in encoders.items():\n", | |
" train_score = 0.\n", | |
" test_score = 0.\n", | |
" st = time.time()\n", | |
" for i in range(10): # for measuring times\n", | |
" train_score_this_iter, test_score_this_iter = fitmodel_and_auc_score(enc, \n", | |
" train_data_df, \n", | |
" test_data_df, [\"x\"], \"y\") \n", | |
" train_score+=train_score_this_iter\n", | |
" test_score+=test_score_this_iter\n", | |
" # averaging score\n", | |
" train_score/=10.\n", | |
" test_score/=10.\n", | |
" times[enc_name] = time.time()-st \n", | |
" train_scores[enc_name] = train_score\n", | |
" test_scores[enc_name] = test_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 543, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# now fitting on catboost\n", | |
"cat_features=[0]\n", | |
"train_score = 0.\n", | |
"test_score = 0.\n", | |
"st = time.time()\n", | |
"for i in range(10):\n", | |
" model = CatBoostClassifier(\n", | |
" iterations=500,\n", | |
" early_stopping_rounds=20)\n", | |
"\n", | |
" model.fit(\n", | |
" train_data_df[\"x\"], train_data_df[\"y\"],\n", | |
" cat_features=cat_features,\n", | |
" eval_set=(test_data_df[\"x\"], test_data_df[\"y\"]),\n", | |
" verbose=False\n", | |
" )\n", | |
" \n", | |
" train_score_this_iter = roc_auc_score(train_data_df.y, model.predict(train_data_df.x))\n", | |
" test_score_this_iter = roc_auc_score(test_data_df.y, model.predict(test_data_df.x))\n", | |
" train_score+=train_score_this_iter\n", | |
" test_score+=test_score_this_iter\n", | |
" # averaging score\n", | |
"train_score/=10.\n", | |
"test_score/=10.\n", | |
"times[\"catboost\"] = time.time()-st \n", | |
"train_scores[\"catboost\"] = train_score\n", | |
"test_scores[\"catboost\"] = test_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 544, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'one_hot': 0.9963942307692308,\n", | |
" 'numeric_label': 1.0,\n", | |
" 'hash': 0.9690656565656568,\n", | |
" 'kfold_target': 1.0,\n", | |
" 'catboost_target_encoder': 0.9951923076923077,\n", | |
" 'catboost': 1.0}" | |
] | |
}, | |
"execution_count": 544, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_scores" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 545, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'one_hot': 0.9947867298578199,\n", | |
" 'numeric_label': 1.0,\n", | |
" 'hash': 0.9670468948035487,\n", | |
" 'kfold_target': 1.0,\n", | |
" 'catboost_target_encoder': 1.0,\n", | |
" 'catboost': 1.0}" | |
] | |
}, | |
"execution_count": 545, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_scores" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 546, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'one_hot': 10.497339248657227,\n", | |
" 'numeric_label': 11.937826156616211,\n", | |
" 'hash': 13.738202810287476,\n", | |
" 'kfold_target': 9.657154083251953,\n", | |
" 'catboost_target_encoder': 9.620632886886597,\n", | |
" 'catboost': 63.245049238204956}" | |
] | |
}, | |
"execution_count": 546, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"times" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 547, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 864x360 with 2 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# AUC scores per encoder on the left axis, elapsed time on the right axis\n", | |
"fig, ax = plt.subplots(1,1,figsize=(12,5))\n", | |
"\n", | |
"ax2 = ax.twinx()\n", | |
"ax.plot(train_scores.keys(), [round(t,2) for t in train_scores.values()], \"--\", color=\"g\", alpha=0.5)\n", | |
"ax.plot(test_scores.keys(), [round(t,2) for t in test_scores.values()], \"-.\", color=\"b\", alpha=0.8)\n", | |
"ax2.plot(times.keys(), [round(t,2) for t in times.values()], color=\"r\", alpha=0.6)\n", | |
"# twinx() creates the second axes with an invisible x-axis, so the x label\n", | |
"# has to be set on the primary axes to actually be drawn\n", | |
"ax.set_xlabel(\"Encoders\")\n", | |
"ax.set_ylabel(\"AUC Scores\")\n", | |
"ax2.set_ylabel(\"Time (seconds)\")\n", | |
"ax2.yaxis.tick_right()\n", | |
"ax.legend([\"Train Score\", \"Test Score\"])\n", | |
"ax2.legend([\"Time (seconds)\"])\n", | |
"\n", | |
"fig.tight_layout()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Dataset 2\n", | |
"Amazon.com - Employee Access Challenge data. Contains all categorical variables with different cardinalities. \n", | |
"https://www.kaggle.com/c/amazon-employee-access-challenge" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Loading dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 485, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from catboost.datasets import amazon" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 486, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"train, test = amazon()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 487, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"ACTION 2\n", | |
"RESOURCE 7518\n", | |
"MGR_ID 4243\n", | |
"ROLE_ROLLUP_1 128\n", | |
"ROLE_ROLLUP_2 177\n", | |
"ROLE_DEPTNAME 449\n", | |
"ROLE_TITLE 343\n", | |
"ROLE_FAMILY_DESC 2358\n", | |
"ROLE_FAMILY 67\n", | |
"ROLE_CODE 343\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 487, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# cardinality\n", | |
"train.apply(lambda x: len(x.unique()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 551, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(0.22942415087430193, 0.002044615337666697)" | |
] | |
}, | |
"execution_count": 551, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"7518/train.shape[0], 67/train.shape[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 488, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# ROLE_CODE == ROLE_TITLE\n", | |
"# removing one; reassigning with errors=\"ignore\" (instead of inplace=True)\n", | |
"# keeps this cell safe to re-run after the column is already gone\n", | |
"train = train.drop(columns=\"ROLE_TITLE\", errors=\"ignore\")\n", | |
"test = test.drop(columns=\"ROLE_TITLE\", errors=\"ignore\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 489, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ACTION</th>\n", | |
" <th>RESOURCE</th>\n", | |
" <th>MGR_ID</th>\n", | |
" <th>ROLE_ROLLUP_1</th>\n", | |
" <th>ROLE_ROLLUP_2</th>\n", | |
" <th>ROLE_DEPTNAME</th>\n", | |
" <th>ROLE_FAMILY_DESC</th>\n", | |
" <th>ROLE_FAMILY</th>\n", | |
" <th>ROLE_CODE</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>39353</td>\n", | |
" <td>85475</td>\n", | |
" <td>117961</td>\n", | |
" <td>118300</td>\n", | |
" <td>123472</td>\n", | |
" <td>117906</td>\n", | |
" <td>290919</td>\n", | |
" <td>117908</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>17183</td>\n", | |
" <td>1540</td>\n", | |
" <td>117961</td>\n", | |
" <td>118343</td>\n", | |
" <td>123125</td>\n", | |
" <td>118536</td>\n", | |
" <td>308574</td>\n", | |
" <td>118539</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>36724</td>\n", | |
" <td>14457</td>\n", | |
" <td>118219</td>\n", | |
" <td>118220</td>\n", | |
" <td>117884</td>\n", | |
" <td>267952</td>\n", | |
" <td>19721</td>\n", | |
" <td>117880</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>36135</td>\n", | |
" <td>5396</td>\n", | |
" <td>117961</td>\n", | |
" <td>118343</td>\n", | |
" <td>119993</td>\n", | |
" <td>240983</td>\n", | |
" <td>290919</td>\n", | |
" <td>118322</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>42680</td>\n", | |
" <td>5905</td>\n", | |
" <td>117929</td>\n", | |
" <td>117930</td>\n", | |
" <td>119569</td>\n", | |
" <td>123932</td>\n", | |
" <td>19793</td>\n", | |
" <td>119325</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ACTION RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_ROLLUP_2 ROLE_DEPTNAME \\\n", | |
"0 1 39353 85475 117961 118300 123472 \n", | |
"1 1 17183 1540 117961 118343 123125 \n", | |
"2 1 36724 14457 118219 118220 117884 \n", | |
"3 1 36135 5396 117961 118343 119993 \n", | |
"4 1 42680 5905 117929 117930 119569 \n", | |
"\n", | |
" ROLE_FAMILY_DESC ROLE_FAMILY ROLE_CODE \n", | |
"0 117906 290919 117908 \n", | |
"1 118536 308574 118539 \n", | |
"2 267952 19721 117880 \n", | |
"3 240983 290919 118322 \n", | |
"4 123932 19793 119325 " | |
] | |
}, | |
"execution_count": 489, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 490, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Ones 0.94\n", | |
"Zeros 0.06\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"Ones\", round(sum(train.ACTION == 1)/len(train),2))\n", | |
"print(\"Zeros\", round(sum(train.ACTION == 0)/len(train),2))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 491, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(32769, 9)" | |
] | |
}, | |
"execution_count": 491, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 492, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"26215.2" | |
] | |
}, | |
"execution_count": 492, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"0.8*train.shape[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 493, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Randomly splitting train-test data (80,20)\n", | |
"# Draw ONE permutation and slice it twice so the two subsets are disjoint.\n", | |
"# Calling np.random.permutation separately for each subset produces two\n", | |
"# different shuffles, which lets validation rows also appear in training.\n", | |
"np.random.seed(10)\n", | |
"shuffled_idx = np.random.permutation(train.index)\n", | |
"train_sub = train.iloc[shuffled_idx[:26215],:].reset_index(drop=True)\n", | |
"valid_sub = train.iloc[shuffled_idx[26215:],:].reset_index(drop=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 494, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"((26215, 9), (6554, 9))" | |
] | |
}, | |
"execution_count": 494, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_sub.shape, valid_sub.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 495, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ACTION</th>\n", | |
" <th>RESOURCE</th>\n", | |
" <th>MGR_ID</th>\n", | |
" <th>ROLE_ROLLUP_1</th>\n", | |
" <th>ROLE_ROLLUP_2</th>\n", | |
" <th>ROLE_DEPTNAME</th>\n", | |
" <th>ROLE_FAMILY_DESC</th>\n", | |
" <th>ROLE_FAMILY</th>\n", | |
" <th>ROLE_CODE</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>78382</td>\n", | |
" <td>7001</td>\n", | |
" <td>117961</td>\n", | |
" <td>118327</td>\n", | |
" <td>118933</td>\n", | |
" <td>132108</td>\n", | |
" <td>4673</td>\n", | |
" <td>121596</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>74508</td>\n", | |
" <td>17561</td>\n", | |
" <td>91261</td>\n", | |
" <td>118026</td>\n", | |
" <td>118202</td>\n", | |
" <td>118260</td>\n", | |
" <td>290919</td>\n", | |
" <td>118261</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>17249</td>\n", | |
" <td>4914</td>\n", | |
" <td>117961</td>\n", | |
" <td>118300</td>\n", | |
" <td>120026</td>\n", | |
" <td>133686</td>\n", | |
" <td>118424</td>\n", | |
" <td>119435</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>971</td>\n", | |
" <td>3918</td>\n", | |
" <td>117961</td>\n", | |
" <td>118343</td>\n", | |
" <td>118660</td>\n", | |
" <td>250337</td>\n", | |
" <td>118424</td>\n", | |
" <td>120791</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>17171</td>\n", | |
" <td>1810</td>\n", | |
" <td>117961</td>\n", | |
" <td>118327</td>\n", | |
" <td>120559</td>\n", | |
" <td>117906</td>\n", | |
" <td>290919</td>\n", | |
" <td>117908</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ACTION RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_ROLLUP_2 ROLE_DEPTNAME \\\n", | |
"0 1 78382 7001 117961 118327 118933 \n", | |
"1 1 74508 17561 91261 118026 118202 \n", | |
"2 1 17249 4914 117961 118300 120026 \n", | |
"3 1 971 3918 117961 118343 118660 \n", | |
"4 1 17171 1810 117961 118327 120559 \n", | |
"\n", | |
" ROLE_FAMILY_DESC ROLE_FAMILY ROLE_CODE \n", | |
"0 132108 4673 121596 \n", | |
"1 118260 290919 118261 \n", | |
"2 133686 118424 119435 \n", | |
"3 250337 118424 120791 \n", | |
"4 117906 290919 117908 " | |
] | |
}, | |
"execution_count": 495, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_sub.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Modeling" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"1. The data is already numerically label-encoded, so no separate label-encoding step is needed. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 548, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"encoders = {\"one_hot\": one_hot_encoder, \n", | |
" \"numeric_label\": None,\n", | |
" \"kfold_target\": kfold_target_encoder, \n", | |
" \"catboost_target_encoder\": catboost_target_encoder}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 549, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"targetfeatures = ['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2',\n", | |
" 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY', 'ROLE_CODE']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 534, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"train_scores = {}\n", | |
"test_scores = {}\n", | |
"times = {}\n", | |
"for enc_name, enc in encoders.items():\n", | |
" train_score = 0.\n", | |
" test_score = 0.\n", | |
" st = time.time()\n", | |
" for i in range(1): # just running once as this data is large\n", | |
" train_score_this_iter, test_score_this_iter = fitmodel_and_auc_score(enc, \n", | |
" train_sub, \n", | |
" valid_sub,\n", | |
" targetfeatures, \n", | |
" \"ACTION\") \n", | |
" train_score+=train_score_this_iter\n", | |
" test_score+=test_score_this_iter\n", | |
" # averaging score\n", | |
" train_score/=1.\n", | |
" test_score/=1.\n", | |
" times[enc_name] = time.time()-st \n", | |
" train_scores[enc_name] = train_score\n", | |
" test_scores[enc_name] = test_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 535, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# now fitting on catboost\n", | |
"cat_features=range(8)\n", | |
"train_score = 0.\n", | |
"test_score = 0.\n", | |
"st = time.time()\n", | |
"for i in range(1):\n", | |
" model = CatBoostClassifier(\n", | |
" iterations=500,\n", | |
" early_stopping_rounds=20, \n", | |
" eval_metric=\"AUC\")\n", | |
"\n", | |
" model.fit(\n", | |
" train_sub[targetfeatures], train_sub[\"ACTION\"],\n", | |
" cat_features=cat_features,\n", | |
" eval_set=(valid_sub[targetfeatures], valid_sub[\"ACTION\"]),\n", | |
" verbose=False\n", | |
" )\n", | |
" \n", | |
" train_score_this_iter = roc_auc_score(train_sub.ACTION, model.predict(train_sub[targetfeatures]))\n", | |
" test_score_this_iter = roc_auc_score(valid_sub.ACTION, model.predict(valid_sub[targetfeatures]))\n", | |
" train_score+=train_score_this_iter\n", | |
" test_score+=test_score_this_iter\n", | |
" # averaging score\n", | |
"train_score/=1.\n", | |
"test_score/=1.\n", | |
"times[\"catboost\"] = time.time()-st \n", | |
"train_scores[\"catboost\"] = train_score\n", | |
"test_scores[\"catboost\"] = test_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 536, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'one_hot': 0.7323405414477682,\n", | |
" 'numeric_label': 0.8617465313668865,\n", | |
" 'kfold_target': 0.878025396681621,\n", | |
" 'catboost_target_encoder': 0.8418819465905094,\n", | |
" 'catboost': 0.9062952227704307}" | |
] | |
}, | |
"execution_count": 536, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_scores" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 537, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'one_hot': 0.7137792512590169,\n", | |
" 'numeric_label': 0.8418601468103788,\n", | |
" 'kfold_target': 0.849442427403292,\n", | |
" 'catboost_target_encoder': 0.818754352716162,\n", | |
" 'catboost': 0.8715391293807279}" | |
] | |
}, | |
"execution_count": 537, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_scores" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 538, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 864x360 with 2 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# AUC scores per encoder on the left axis, elapsed time on the right axis\n", | |
"fig, ax = plt.subplots(1,1,figsize=(12,5))\n", | |
"\n", | |
"ax2 = ax.twinx()\n", | |
"ax.plot(train_scores.keys(), [round(t,2) for t in train_scores.values()], \"--\", color=\"g\", alpha=0.5)\n", | |
"ax.plot(test_scores.keys(), [round(t,2) for t in test_scores.values()], \"-.\", color=\"b\", alpha=0.8)\n", | |
"ax2.plot(times.keys(), [round(t,2) for t in times.values()], color=\"r\", alpha=0.6)\n", | |
"# twinx() creates the second axes with an invisible x-axis, so the x label\n", | |
"# has to be set on the primary axes to actually be drawn\n", | |
"ax.set_xlabel(\"Encoders\")\n", | |
"ax.set_ylabel(\"AUC Scores\")\n", | |
"ax2.set_ylabel(\"Time (seconds)\")\n", | |
"ax2.yaxis.tick_right()\n", | |
"ax.legend([\"Train Score\", \"Test Score\"], loc=\"lower left\")\n", | |
"ax2.legend([\"Time (seconds)\"])\n", | |
"\n", | |
"fig.tight_layout()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## End" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
@gwerbin - Apologies for missing the imports in the notebook. Here is the link to fitmodel_and_auc_score
: https://github.com/groverpr/Machine-Learning/blob/master/catboost/utils.py#L96
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Where is
fitmodel_and_auc_score
defined?