padjiman · September 6, 2021 17:16
diff --git a/vw_blog.ipynb b/vw_blog.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import datasets\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing train/test sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the VW-formatted dataset into train (80%) and test (20%) files.\n",
    "# Every VW example is one opaque line of text, so lines are read and written\n",
    "# verbatim: a CSV round-trip (pd.read_csv / to_csv) could split a line on a comma\n",
    "# or add CSV quoting around it. `data`, `train` and `test` are lists of lines.\n",
    "directory = \"/Users/padjiman/data/bankVW/\"\n",
    "data_file = \"train.vw\"\n",
    "with open(directory + data_file, encoding='utf-8') as vw_file:\n",
    "    # Blank lines are dropped, as pd.read_csv did by default.\n",
    "    data = [line.rstrip('\\n') for line in vw_file if line.strip()]\n",
    "# random_state is fixed so the split is reproducible from run to run.\n",
    "train, test = train_test_split(data, test_size=0.20, random_state=26)\n",
    "for split_name, split_lines in (('split_train.vw', train), ('split_test.vw', test)):\n",
    "    with open(directory + split_name, 'w', encoding='utf-8') as split_file:\n",
    "        split_file.write('\\n'.join(split_lines) + '\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training the model and scoring the test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "creating features for following interactions: ic \n",
      "final_regressor = model.vw\n",
      "Num weight bits = 26\n",
      "learning rate = 0.5\n",
      "initial_t = 0\n",
      "power_t = 0.5\n",
      "decay_learning_rate = 1\n",
      "using cache_file = split_train.vw.cache\n",
      "ignoring text input in favor of cache input\n",
      "num sources = 1\n",
      "average  since         example        example  current  current  current\n",
      "loss     last          counter         weight    label  predict features\n",
      "0.693147 0.693147            1            1.0   1.0000   0.0000       60\n",
      "1.109843 1.526538            2            2.0  -1.0000   1.2815       60\n",
      "0.760195 0.410548            4            4.0  -1.0000  -0.2754       60\n",
      "0.465710 0.171224            8            8.0  -1.0000  -0.7350       50\n",
      "0.318235 0.170760           16           16.0  -1.0000  -2.7830       60\n",
      "0.531552 0.744869           32           32.0  -1.0000  -6.1178       60\n",
      "2.579228 4.626905           64           64.0  -1.0000  -3.2457       60\n",
      "2.194169 1.809109          128          128.0  -1.0000  -2.9788       60\n",
      "2.114863 2.035556          256          256.0  -1.0000  -1.4744       60\n",
      "1.616420 1.117978          512          512.0  -1.0000  -1.9029       60\n",
      "1.143370 0.670321         1024         1024.0  -1.0000  -3.5995       60\n",
      "0.766199 0.389027         2048         2048.0  -1.0000  -3.3793       60\n",
      "0.530801 0.295404         4096         4096.0  -1.0000  -4.4217       60\n",
      "0.389938 0.249075         8192         8192.0  -1.0000  -2.8836       60\n",
      "0.318798 0.247658        16384        16384.0  -1.0000  -3.9569       70\n",
      "0.263437 0.263437        32768        32768.0  -1.0000  -4.6966       60 h\n",
      "0.252186 0.240939        65536        65536.0  -1.0000  -4.0019       60 h\n",
      "\n",
      "finished run\n",
      "number of examples per pass = 32552\n",
      "passes used = 4\n",
      "weighted example sum = 130208.000000\n",
      "weighted label sum = -99792.000000\n",
      "average loss = 0.239227 h\n",
      "best constant = -2.023110\n",
      "best constant's loss = 0.360496\n",
      "total feature number = 7949680\n",
      "creating features for following interactions: ic \n",
      "only testing\n",
      "predictions = preds.txt\n",
      "Num weight bits = 26\n",
      "learning rate = 0.5\n",
      "initial_t = 0\n",
      "power_t = 0.5\n",
      "using no cache\n",
      "Reading datafile = split_test.vw\n",
      "num sources = 1\n",
      "average  since         example        example  current  current  current\n",
      "loss     last          counter         weight    label  predict features\n",
      "1.358600 1.358600            1            1.0  -1.0000   0.1029       60\n",
      "7.592927 13.827254            2            2.0  -1.0000   0.0088       60\n",
      "9.232505 10.872083            4            4.0   1.0000   0.1225       60\n",
      "9.071785 8.911065            8            8.0  -1.0000   0.1698       60\n",
      "9.430997 9.790208           16           16.0  -1.0000   0.0333       60\n",
      "8.353756 7.276516           32           32.0   1.0000   0.2188       60\n",
      "8.469951 8.586146           64           64.0  -1.0000   0.0321       60\n",
      "7.585703 6.701455          128          128.0  -1.0000   0.0199       60\n",
      "7.449527 7.313351          256          256.0  -1.0000   0.2331       60\n",
      "7.174785 6.900042          512          512.0  -1.0000   0.1774       60\n",
      "7.090911 7.007037         1024         1024.0  -1.0000   0.1080       60\n",
      "7.169777 7.248644         2048         2048.0  -1.0000   0.0220       60\n",
      "7.188963 7.208148         4096         4096.0  -1.0000   0.0207       60\n",
      "7.211928 7.234893         8192         8192.0  -1.0000   0.0065       60\n",
      "\n",
      "finished run\n",
      "number of examples per pass = 9043\n",
      "passes used = 1\n",
      "weighted example sum = 9043.000000\n",
      "weighted label sum = -6955.000000\n",
      "average loss = 7.205263\n",
      "best constant = -0.769103\n",
      "best constant's loss = 0.408480\n",
      "total feature number = 552460\n"
     ]
    }
   ],
   "source": [
    "# Train a logistic model over 4 passes (multiple passes need the -c cache).\n",
    "# -k rebuilds that cache on every run, so a split_train.vw.cache left over from an\n",
    "# earlier split is never silently used in place of the current split_train.vw.\n",
    "!cd $directory && vw split_train.vw -c -k --passes 4 -f model.vw --loss_function logistic --interactions ic -b 26\n",
    "# Score the held-out split without learning (-t); --link logistic writes the\n",
    "# predictions as values in [0, 1]. --loss_function logistic is repeated here so the\n",
    "# reported test loss is the logistic loss, comparable to the training loss above it.\n",
    "!cd $directory && vw split_test.vw -t -i model.vw -p preds.txt --link logistic --loss_function logistic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculating the AUC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.914318312778\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "# Scores read from preds.txt; column 0 holds one score per row.\n",
    "preds = pd.read_csv(directory + 'preds.txt', header=None)\n",
    "# Cutting each test line at '|' leaves the text before the first '|' in column 0;\n",
    "# that column is used as the true label below.\n",
    "test_split = pd.read_csv(directory + 'split_test.vw', sep='|', header=None)\n",
    "\n",
    "# Area under the ROC curve of the scores against the labels.\n",
    "fpr, tpr, thresholds = metrics.roc_curve(\n",
    "    y_true=test_split[0].values,\n",
    "    y_score=preds[0].values\n",
    ")\n",
    "auc = metrics.auc(fpr, tpr)\n",
    "print(auc)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 125,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"from sklearn import datasets\n",
	"from sklearn.model_selection import train_test_split\n",
	"import matplotlib.pyplot as plt"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Preparing train/test sets"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 126,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Split the VW-formatted dataset into train (80%) and test (20%) files.\n",
	"# Every VW example is one opaque line of text, so lines are read and written\n",
	"# verbatim: a CSV round-trip (pd.read_csv / to_csv) could split a line on a comma\n",
	"# or add CSV quoting around it. `data`, `train` and `test` are lists of lines.\n",
	"directory = \"/Users/padjiman/data/bankVW/\"\n",
	"data_file = \"train.vw\"\n",
	"with open(directory + data_file, encoding='utf-8') as vw_file:\n",
	"    # Blank lines are dropped, as pd.read_csv did by default.\n",
	"    data = [line.rstrip('\\n') for line in vw_file if line.strip()]\n",
	"# random_state is fixed so the split is reproducible from run to run.\n",
	"train, test = train_test_split(data, test_size=0.20, random_state=26)\n",
	"for split_name, split_lines in (('split_train.vw', train), ('split_test.vw', test)):\n",
	"    with open(directory + split_name, 'w', encoding='utf-8') as split_file:\n",
	"        split_file.write('\\n'.join(split_lines) + '\\n')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Training the model and scoring the test set"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 128,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"creating features for following interactions: ic \n",
	"final_regressor = model.vw\n",
	"Num weight bits = 26\n",
	"learning rate = 0.5\n",
	"initial_t = 0\n",
	"power_t = 0.5\n",
	"decay_learning_rate = 1\n",
	"using cache_file = split_train.vw.cache\n",
	"ignoring text input in favor of cache input\n",
	"num sources = 1\n",
	"average since example example current current current\n",
	"loss last counter weight label predict features\n",
	"0.693147 0.693147 1 1.0 1.0000 0.0000 60\n",
	"1.109843 1.526538 2 2.0 -1.0000 1.2815 60\n",
	"0.760195 0.410548 4 4.0 -1.0000 -0.2754 60\n",
	"0.465710 0.171224 8 8.0 -1.0000 -0.7350 50\n",
	"0.318235 0.170760 16 16.0 -1.0000 -2.7830 60\n",
	"0.531552 0.744869 32 32.0 -1.0000 -6.1178 60\n",
	"2.579228 4.626905 64 64.0 -1.0000 -3.2457 60\n",
	"2.194169 1.809109 128 128.0 -1.0000 -2.9788 60\n",
	"2.114863 2.035556 256 256.0 -1.0000 -1.4744 60\n",
	"1.616420 1.117978 512 512.0 -1.0000 -1.9029 60\n",
	"1.143370 0.670321 1024 1024.0 -1.0000 -3.5995 60\n",
	"0.766199 0.389027 2048 2048.0 -1.0000 -3.3793 60\n",
	"0.530801 0.295404 4096 4096.0 -1.0000 -4.4217 60\n",
	"0.389938 0.249075 8192 8192.0 -1.0000 -2.8836 60\n",
	"0.318798 0.247658 16384 16384.0 -1.0000 -3.9569 70\n",
	"0.263437 0.263437 32768 32768.0 -1.0000 -4.6966 60 h\n",
	"0.252186 0.240939 65536 65536.0 -1.0000 -4.0019 60 h\n",
	"\n",
	"finished run\n",
	"number of examples per pass = 32552\n",
	"passes used = 4\n",
	"weighted example sum = 130208.000000\n",
	"weighted label sum = -99792.000000\n",
	"average loss = 0.239227 h\n",
	"best constant = -2.023110\n",
	"best constant's loss = 0.360496\n",
	"total feature number = 7949680\n",
	"creating features for following interactions: ic \n",
	"only testing\n",
	"predictions = preds.txt\n",
	"Num weight bits = 26\n",
	"learning rate = 0.5\n",
	"initial_t = 0\n",
	"power_t = 0.5\n",
	"using no cache\n",
	"Reading datafile = split_test.vw\n",
	"num sources = 1\n",
	"average since example example current current current\n",
	"loss last counter weight label predict features\n",
	"1.358600 1.358600 1 1.0 -1.0000 0.1029 60\n",
	"7.592927 13.827254 2 2.0 -1.0000 0.0088 60\n",
	"9.232505 10.872083 4 4.0 1.0000 0.1225 60\n",
	"9.071785 8.911065 8 8.0 -1.0000 0.1698 60\n",
	"9.430997 9.790208 16 16.0 -1.0000 0.0333 60\n",
	"8.353756 7.276516 32 32.0 1.0000 0.2188 60\n",
	"8.469951 8.586146 64 64.0 -1.0000 0.0321 60\n",
	"7.585703 6.701455 128 128.0 -1.0000 0.0199 60\n",
	"7.449527 7.313351 256 256.0 -1.0000 0.2331 60\n",
	"7.174785 6.900042 512 512.0 -1.0000 0.1774 60\n",
	"7.090911 7.007037 1024 1024.0 -1.0000 0.1080 60\n",
	"7.169777 7.248644 2048 2048.0 -1.0000 0.0220 60\n",
	"7.188963 7.208148 4096 4096.0 -1.0000 0.0207 60\n",
	"7.211928 7.234893 8192 8192.0 -1.0000 0.0065 60\n",
	"\n",
	"finished run\n",
	"number of examples per pass = 9043\n",
	"passes used = 1\n",
	"weighted example sum = 9043.000000\n",
	"weighted label sum = -6955.000000\n",
	"average loss = 7.205263\n",
	"best constant = -0.769103\n",
	"best constant's loss = 0.408480\n",
	"total feature number = 552460\n"
	]
	}
	],
	"source": [
	"# Train a logistic model over 4 passes (multiple passes need the -c cache).\n",
	"# -k rebuilds that cache on every run, so a split_train.vw.cache left over from an\n",
	"# earlier split is never silently used in place of the current split_train.vw.\n",
	"!cd $directory && vw split_train.vw -c -k --passes 4 -f model.vw --loss_function logistic --interactions ic -b 26\n",
	"# Score the held-out split without learning (-t); --link logistic writes the\n",
	"# predictions as values in [0, 1]. --loss_function logistic is repeated here so the\n",
	"# reported test loss is the logistic loss, comparable to the training loss above it.\n",
	"!cd $directory && vw split_test.vw -t -i model.vw -p preds.txt --link logistic --loss_function logistic"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Calculating the AUC"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 129,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.914318312778\n"
	]
	}
	],
	"source": [
	"from sklearn import metrics\n",
	"\n",
	"# Scores read from preds.txt; column 0 holds one score per row.\n",
	"preds = pd.read_csv(directory + 'preds.txt', header=None)\n",
	"# Cutting each test line at '|' leaves the text before the first '|' in column 0;\n",
	"# that column is used as the true label below. The separator is a plain '|':\n",
	"# a backslash before it would be an invalid escape inside this JSON string.\n",
	"test_split = pd.read_csv(directory + 'split_test.vw', sep='|', header=None)\n",
	"\n",
	"# Area under the ROC curve of the scores against the labels.\n",
	"fpr, tpr, thresholds = metrics.roc_curve(\n",
	"    y_true=test_split[0].values,\n",
	"    y_score=preds[0].values\n",
	")\n",
	"auc = metrics.auc(fpr, tpr)\n",
	"print(auc)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}