# for auc() and roc()
> library(pROC)
# for prediction() and performance() plots
> library(ROCR)
Loading required package: gplots
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
# read the data
> bank = read.delim(file="/Users/tdunning/tmp/mahout-tmp/bank-full.csv", sep=';')
# check it out
> dim(bank)
[1] 45211 17
> summary(bank)
      age                job            marital          education        default
 Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851   no :44396
 1st Qu.:33.00   management :9458   married :27214   secondary:23202   yes:  815
 Median :39.00   technician :7597   single  :12790   tertiary :13301
 Mean   :40.94   admin.     :5171                    unknown  : 1857
 3rd Qu.:48.00   services   :4154
 Max.   :95.00   retired    :2264
                 (Other)    :6835
    balance       housing      loan          contact            day            month
 Min.   : -8019   no :20081   no :37967   cellular :29285   Min.   : 1.00   may    :13766
 1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906   1st Qu.: 8.00   jul    : 6895
 Median :   448                           unknown  :13020   Median :16.00   aug    : 6247
 Mean   :  1362                                             Mean   :15.81   jun    : 5341
 3rd Qu.:  1428                                             3rd Qu.:21.00   nov    : 3970
 Max.   :102127                                             Max.   :31.00   apr    : 2932
                                                                            (Other): 6060
    duration        campaign          pdays           previous         poutcome
 Min.   :   0.0   Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000   failure: 4901
 1st Qu.: 103.0   1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1840
 Median : 180.0   Median : 2.000   Median : -1.0   Median :  0.0000   success: 1511
 Mean   : 258.2   Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803   unknown:36959
 3rd Qu.: 319.0   3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000
 Max.   :4918.0   Max.   :63.000   Max.   :871.0   Max.   :275.0000
   y
 no :39922
 yes: 5289
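# the response is quite unbalanced: roughly 88% 'no' and 12% 'yes' given the
# counts above. Worth keeping in mind when we look at accuracy later. (This
# command is a sketch, not part of the original session.)
> table(bank$y) / nrow(bank)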
# split into training and test data. A single random split is enough, as we will see later
> train = runif(45211) < 0.9
> test = !train
> mean(train)
[1] 0.8994271
> mean(test)
[1] 0.1005729
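# (sketch, not part of the original session) for a reproducible split, set a
# seed first (the value 1 here is arbitrary) and derive the row count from the
# data rather than hard-coding it
> set.seed(1)
> train = runif(nrow(bank)) < 0.9
> test = !train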
# build a logistic regression model using all of the other columns as predictors
> m = glm(y ~ age + job + marital + education + default + balance + housing + loan + contact + day + month + duration + campaign + pdays + previous + poutcome, bank[train,], family='binomial')
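# (sketch, not part of the original session) since every column except y is a
# predictor, the same model can be written more compactly; summary(m) then
# shows the fitted coefficients
> m = glm(y ~ ., data=bank[train,], family='binomial')
> summary(m)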
# now evaluate on the held-out test data. We should see AUC > 0.9
> auc(bank[test,]$y, predict(m, new=bank[test,]))
Area under the curve: 0.9018
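# (sketch, not part of the original session) the same number can be
# cross-checked with ROCR, which is already loaded; 'pred' is just a
# convenience name
> pred = prediction(predict(m, new=bank[test,]), bank[test,]$y)
> performance(pred, "auc")@y.values[[1]]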
# plot ROC curves for test and training data. They are right on top of each other
> plot(roc(bank[test,]$y, predict(m, new=bank[test,])))
# this second curve is slow to compute; the training set is about nine times larger than the test set
> lines(roc(bank[train,]$y, predict(m, new=bank[train,])), col='red')
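# (sketch, not part of the original session) if the full training-set curve is
# too slow, a random subsample of the training rows (5000 here, chosen
# arbitrarily) gives essentially the same curve much faster
> idx = sample(which(train), 5000)
> lines(roc(bank[idx,]$y, predict(m, new=bank[idx,])), col='red')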

# look at confusion tables at a few cutoffs (the predictions are on the log-odds scale)
> summary(predict(m, new=bank[test,]))
    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  -9.675  -3.869  -3.061  -2.794  -2.004   8.743
> table(bank[test,]$y, predict(m, new=bank[test,])>0)
      FALSE TRUE
  no   3898  117
  yes   354  178
> table(bank[test,]$y, predict(m, new=bank[test,])>-1)
      FALSE TRUE
  no   3753  262
  yes   208  324
> table(bank[test,]$y, predict(m, new=bank[test,])>-2)
      FALSE TRUE
  no   3329  686
  yes    84  448
> table(bank[test,]$y, predict(m, new=bank[test,])>-3)
      FALSE TRUE
  no   2338 1677
  yes    19  513
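# (sketch, not part of the original session) precision and recall can be read
# off any of these tables; at a cutoff of -1, for instance ('tab' is just a
# convenience name)
> tab = table(bank[test,]$y, predict(m, new=bank[test,]) > -1)
> tab["yes", "TRUE"] / sum(tab[, "TRUE"])   # precision
> tab["yes", "TRUE"] / sum(tab["yes", ])    # recall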
# plot the F score as a function of the cutoff; note the sharp optimum
> plot(performance(prediction(predict(m, new=bank[test,]), bank[test,]$y), "f"))
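# (sketch, not part of the original session) the cutoff that maximizes F can be
# pulled straight out of the performance object; 'perf' is just a convenience name
> perf = performance(prediction(predict(m, new=bank[test,]), bank[test,]$y), "f")
> perf@x.values[[1]][which.max(perf@y.values[[1]])]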

# plot accuracy
> plot(performance(prediction(predict(m, new=bank[test,]), bank[test,]$y), "acc"))
# zoom in and plot training data, too
> plot(performance(prediction(predict(m, new=bank[test,]), bank[test,]$y), "acc"), xlim=c(-3,2), ylim=c(0.8,1))
> plot(performance(prediction(predict(m, new=bank[train,]), bank[train,]$y), "acc"), xlim=c(-3,2), ylim=c(0.8,1), add=T, col='red')
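# (sketch, not part of the original session) the accuracy curves should be read
# against the trivial baseline of always predicting 'no', which is already
# right about 88% of the time
> mean(bank[test,]$y == 'no')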

# here is the ROC curve again, this time drawn with ROCR
> plot(performance(prediction(predict(m, new=bank[test,]), bank[test,]$y), "tpr", "fpr"))
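# (sketch, not part of the original session) a diagonal reference line marks
# what random guessing would do
> abline(0, 1, lty=2)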