Last active
September 6, 2021 17:16
-
-
Save padjiman/5f82a1b1559b11f36a706c4b04e5ab59 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 125, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"from sklearn import datasets\n", | |
"from sklearn import metrics\n", | |
"from sklearn.model_selection import train_test_split" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Preparing train/test sets" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 126, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Folder holding the VW-formatted dataset. Keep the trailing slash: the paths\n", | |
"# below are built by plain string concatenation.\n", | |
"# Alternative dataset folder: \"/Users/padjiman/data/KDDCUP1998/\"\n", | |
"directory = \"/Users/padjiman/data/bankVW/\"\n", | |
"data_file = \"train.vw\"\n", | |
"\n", | |
"# Read the examples as raw text lines rather than through a CSV parser:\n", | |
"# a comma or quote inside a VW example would be split or re-quoted by\n", | |
"# read_csv/to_csv, and the rewritten files would no longer be valid VW input.\n", | |
"with open(directory + data_file, encoding=\"utf-8\") as vw_file:\n", | |
"    data = [line.rstrip(\"\\n\") for line in vw_file if line.strip()]\n", | |
"\n", | |
"# 80/20 split; the fixed random_state keeps it reproducible across runs.\n", | |
"train, test = train_test_split(data, test_size=0.20, random_state=26)\n", | |
"\n", | |
"# Write one example per line, each terminated by a newline.\n", | |
"for split_name, split_lines in ((\"split_train.vw\", train), (\"split_test.vw\", test)):\n", | |
"    with open(directory + split_name, \"w\", encoding=\"utf-8\") as out_file:\n", | |
"        out_file.writelines(line + \"\\n\" for line in split_lines)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Training the model and predicting on the test set" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 128, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"creating features for following interactions: ic \n", | |
"final_regressor = model.vw\n", | |
"Num weight bits = 26\n", | |
"learning rate = 0.5\n", | |
"initial_t = 0\n", | |
"power_t = 0.5\n", | |
"decay_learning_rate = 1\n", | |
"using cache_file = split_train.vw.cache\n", | |
"ignoring text input in favor of cache input\n", | |
"num sources = 1\n", | |
"average since example example current current current\n", | |
"loss last counter weight label predict features\n", | |
"0.693147 0.693147 1 1.0 1.0000 0.0000 60\n", | |
"1.109843 1.526538 2 2.0 -1.0000 1.2815 60\n", | |
"0.760195 0.410548 4 4.0 -1.0000 -0.2754 60\n", | |
"0.465710 0.171224 8 8.0 -1.0000 -0.7350 50\n", | |
"0.318235 0.170760 16 16.0 -1.0000 -2.7830 60\n", | |
"0.531552 0.744869 32 32.0 -1.0000 -6.1178 60\n", | |
"2.579228 4.626905 64 64.0 -1.0000 -3.2457 60\n", | |
"2.194169 1.809109 128 128.0 -1.0000 -2.9788 60\n", | |
"2.114863 2.035556 256 256.0 -1.0000 -1.4744 60\n", | |
"1.616420 1.117978 512 512.0 -1.0000 -1.9029 60\n", | |
"1.143370 0.670321 1024 1024.0 -1.0000 -3.5995 60\n", | |
"0.766199 0.389027 2048 2048.0 -1.0000 -3.3793 60\n", | |
"0.530801 0.295404 4096 4096.0 -1.0000 -4.4217 60\n", | |
"0.389938 0.249075 8192 8192.0 -1.0000 -2.8836 60\n", | |
"0.318798 0.247658 16384 16384.0 -1.0000 -3.9569 70\n", | |
"0.263437 0.263437 32768 32768.0 -1.0000 -4.6966 60 h\n", | |
"0.252186 0.240939 65536 65536.0 -1.0000 -4.0019 60 h\n", | |
"\n", | |
"finished run\n", | |
"number of examples per pass = 32552\n", | |
"passes used = 4\n", | |
"weighted example sum = 130208.000000\n", | |
"weighted label sum = -99792.000000\n", | |
"average loss = 0.239227 h\n", | |
"best constant = -2.023110\n", | |
"best constant's loss = 0.360496\n", | |
"total feature number = 7949680\n", | |
"creating features for following interactions: ic \n", | |
"only testing\n", | |
"predictions = preds.txt\n", | |
"Num weight bits = 26\n", | |
"learning rate = 0.5\n", | |
"initial_t = 0\n", | |
"power_t = 0.5\n", | |
"using no cache\n", | |
"Reading datafile = split_test.vw\n", | |
"num sources = 1\n", | |
"average since example example current current current\n", | |
"loss last counter weight label predict features\n", | |
"1.358600 1.358600 1 1.0 -1.0000 0.1029 60\n", | |
"7.592927 13.827254 2 2.0 -1.0000 0.0088 60\n", | |
"9.232505 10.872083 4 4.0 1.0000 0.1225 60\n", | |
"9.071785 8.911065 8 8.0 -1.0000 0.1698 60\n", | |
"9.430997 9.790208 16 16.0 -1.0000 0.0333 60\n", | |
"8.353756 7.276516 32 32.0 1.0000 0.2188 60\n", | |
"8.469951 8.586146 64 64.0 -1.0000 0.0321 60\n", | |
"7.585703 6.701455 128 128.0 -1.0000 0.0199 60\n", | |
"7.449527 7.313351 256 256.0 -1.0000 0.2331 60\n", | |
"7.174785 6.900042 512 512.0 -1.0000 0.1774 60\n", | |
"7.090911 7.007037 1024 1024.0 -1.0000 0.1080 60\n", | |
"7.169777 7.248644 2048 2048.0 -1.0000 0.0220 60\n", | |
"7.188963 7.208148 4096 4096.0 -1.0000 0.0207 60\n", | |
"7.211928 7.234893 8192 8192.0 -1.0000 0.0065 60\n", | |
"\n", | |
"finished run\n", | |
"number of examples per pass = 9043\n", | |
"passes used = 1\n", | |
"weighted example sum = 9043.000000\n", | |
"weighted label sum = -6955.000000\n", | |
"average loss = 7.205263\n", | |
"best constant = -0.769103\n", | |
"best constant's loss = 0.408480\n", | |
"total feature number = 552460\n" | |
] | |
} | |
], | |
"source": [ | |
"# -k drops any existing split_train.vw.cache so that -c never trains on a cache left over from a different split or different options.\n", | |
"!cd \"$directory\" && vw split_train.vw -c -k --passes 4 -f model.vw --loss_function logistic --interactions ic -b 26\n", | |
"# Passing --loss_function logistic together with -t makes the reported test loss a logistic loss, comparable to the training run above.\n", | |
"!cd \"$directory\" && vw split_test.vw -t -i model.vw -p preds.txt --link logistic --loss_function logistic" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Calculating the AUC" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 129, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.914318312778\n" | |
] | |
} | |
], | |
"source": [ | |
"# Predictions written by `vw -p`: one row per test example, value in column 0.\n", | |
"preds = pd.read_csv(directory + 'preds.txt', header=None)\n", | |
"\n", | |
"# In VW text format the label is the first whitespace-separated token in front of the\n", | |
"# first '|'. Parsing it per line (instead of loading the file as a '|'-separated table)\n", | |
"# also works when examples have differing numbers of namespaces or carry a weight/tag.\n", | |
"with open(directory + 'split_test.vw', encoding='utf-8') as vw_file:\n", | |
"    labels = [float(line.split('|', 1)[0].split()[0]) for line in vw_file if line.strip()]\n", | |
"\n", | |
"# roc_auc_score is the single-call equivalent of roc_curve followed by auc.\n", | |
"auc = metrics.roc_auc_score(labels, preds[0].values)\n", | |
"print(auc)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment