# for auc() and roc()
> library(pROC)
# for prediction() and performance() plots
> library(ROCR)
Loading required package: gplots
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
# read the data
> bank = read.delim(file="/Users/tdunning/tmp/mahout-tmp/bank-full.csv", sep=';')
# check it out
> dim(bank)
[1] 45211 17
> summary(bank)
      age                job            marital          education        default
 Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851   no :44396
 1st Qu.:33.00   management :9458   married :27214   secondary:23202   yes:  815
 Median :39.00   technician :7597   single  :12790   tertiary :13301
 Mean   :40.94   admin.     :5171                    unknown  : 1857
 3rd Qu.:48.00   services   :4154
 Max.   :95.00   retired    :2264
                 (Other)    :6835
    balance       housing      loan          contact            day            month
 Min.   : -8019   no :20081   no :37967   cellular :29285   Min.   : 1.00   may    :13766
 1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906   1st Qu.: 8.00   jul    : 6895
 Median :   448                           unknown  :13020   Median :16.00   aug    : 6247
 Mean   :  1362                                             Mean   :15.81   jun    : 5341
 3rd Qu.:  1428                                             3rd Qu.:21.00   nov    : 3970
 Max.   :102127                                             Max.   :31.00   apr    : 2932
                                                                            (Other): 6060
    duration        campaign          pdays           previous         poutcome
 Min.   :   0.0   Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000   failure: 4901
 1st Qu.: 103.0   1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1840
 Median : 180.0   Median : 2.000   Median : -1.0   Median :  0.0000   success: 1511
 Mean   : 258.2   Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803   unknown:36959
 3rd Qu.: 319.0   3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000
 Max.   :4918.0   Max.   :63.000   Max.   :871.0   Max.   :275.0000
   y
 no :39922
 yes: 5289
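# the response is quite unbalanced: roughly 88% 'no' and 12% 'yes' given the
# counts above. Worth keeping in mind when we look at accuracy later. (This
# command is a sketch, not part of the original session.)
> table(bank$y) / nrow(bank)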
# split into training and test data. A single random split is enough, as we will see later
> train = runif(45211) < 0.9
> test = !train
> mean(train)
[1] 0.8994271
> mean(test)
[1] 0.1005729
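# (sketch, not part of the original session) for a reproducible split, set a
# seed first (the value 1 here is arbitrary) and derive the row count from the
# data rather than hard-coding it
> set.seed(1)
> train = runif(nrow(bank)) < 0.9
> test = !train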
# build a logistic regression model using all of the other columns as predictors
> m = glm(y ~ age + job + marital + education + default + balance + housing + loan + contact + day + month + duration + campaign + pdays + previous + poutcome, bank[train,], family='binomial')
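# (sketch, not part of the original session) since every column except y is a
# predictor, the same model can be written more compactly; summary(m) then
# shows the fitted coefficients
> m = glm(y ~ ., data=bank[train,], family='binomial')
> summary(m)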
# now evaluate on the held-out test data. We should see AUC > 0.9
> auc(bank[test,]$y, predict(m, new=bank[test,]))
Area under the curve: 0.9018
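# (sketch, not part of the original session) the same number can be
# cross-checked with ROCR, which is already loaded; 'pred' is just a
# convenience name
> pred = prediction(predict(m, new=bank[test,]), bank[test,]$y)
> performance(pred, "auc")@y.values[[1]]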
# plot ROC curves for test and training data. They are right on top of each other
> plot(roc(bank[test,]$y, predict(m, new=bank[test,])))
# this second curve is slow to compute; the training set is about nine times larger than the test set
> lines(roc(bank[train,]$y, predict(m, new=bank[train,])), col='red')
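# (sketch, not part of the original session) if the full training-set curve is
# too slow, a random subsample of the training rows (5000 here, chosen
# arbitrarily) gives essentially the same curve much faster
> idx = sample(which(train), 5000)
> lines(roc(bank[idx,]$y, predict(m, new=bank[idx,])), col='red')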

# look at confusion tables at a few cutoffs (the predictions are on the log-odds scale)
> summary(predict(m, new=bank[test,]))
    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  -9.675  -3.869  -3.061  -2.794  -2.004   8.743
> table(bank[test,]$y, predict(m, new=bank[test,])>0)
      FALSE TRUE
  no   3898  117
  yes   354  178
> table(bank[test,]$y, predict(m, new=bank[test,])>-1)
      FALSE TRUE
  no   3753  262
  yes   208  324
> table(bank[test,]$y, predict(m, new=bank[test,])>-2)
      FALSE TRUE
  no   3329  686
  yes    84  448
> table(bank[test,]$y, predict(m, new=bank[test,])>-3)
      FALSE TRUE
  no   2338 1677
  yes    19  513
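# (sketch, not part of the original session) precision and recall can be read
# off any of these tables; at a cutoff of -1, for instance ('tab' is just a
# convenience name)
> tab = table(bank[test,]$y, predict(m, new=bank[test,]) > -1)
> tab["yes", "TRUE"] / sum(tab[, "TRUE"])   # precision
> tab["yes", "TRUE"] / sum(tab["yes", ])    # recall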
# plot the F score as a function of the cutoff; note the sharp optimum
> plot(performance(prediction(predict(m, new=bank[test,]), bank[test,]$y), "f"))
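# (sketch, not part of the original session) the cutoff that maximizes F can be
# pulled straight out of the performance object; 'perf' is just a convenience name
> perf = performance(prediction(predict(m, new=bank[test,]), bank[test,]$y), "f")
> perf@x.values[[1]][which.max(perf@y.values[[1]])]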

# plot accuracy
> plot(performance(prediction(predict(m, new=bank[test,]), bank[test,]$y), "acc"))
# zoom in and plot training data, too
> plot(performance(prediction(predict(m, new=bank[test,]), bank[test,]$y), "acc"), xlim=c(-3,2), ylim=c(0.8,1))
> plot(performance(prediction(predict(m, new=bank[train,]), bank[train,]$y), "acc"), xlim=c(-3,2), ylim=c(0.8,1), add=T, col='red')
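# (sketch, not part of the original session) the accuracy curves should be read
# against the trivial baseline of always predicting 'no', which is already
# right about 88% of the time
> mean(bank[test,]$y == 'no')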

# here is the ROC curve again, this time drawn with ROCR
> plot(performance(prediction(predict(m, new=bank[test,]), bank[test,]$y), "tpr", "fpr"))
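# (sketch, not part of the original session) a diagonal reference line marks
# what random guessing would do
> abline(0, 1, lty=2)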