Skip to content

Instantly share code, notes, and snippets.

@arthurwuhoo
Created June 16, 2016 18:49
Show Gist options
  • Save arthurwuhoo/e101bf207137af0bda1765e22089dfa8 to your computer and use it in GitHub Desktop.
Save arthurwuhoo/e101bf207137af0bda1765e22089dfa8 to your computer and use it in GitHub Desktop.
# ------------------------------------------------------------------
# DAY 12 EXERCISES - CROSS VALIDATION
# ------------------------------------------------------------------
# ------------------------------------------------------------------
# EXERCISE 1
# Build a Logistic Regression model classifying the credit status of
# customers (good or bad) in this data. Without using any packages, apply 5
# -fold cross-validation on the model. Once you have five models (and five
# sets of predicted values), average them to in order to create a new
# estimate. How does this new, averaged model compare in MSE against each of
# the other five models?
# ------------------------------------------------------------------
library(RCurl)
credit <- read.csv("https://raw.githubusercontent.com/gastonstat/CreditScoring/master/CleanCreditScoring.csv")
credit <- credit[,1:16] #returning only the ones with numeric values
str(credit) ##good is 2, bad is 1
#create training and testing data
train_ind <- createDataPartition(credit$Status, p = 0.8)[[1]]
credit.train <- credit[train_ind,]
credit.test <- credit[-train_ind,]
#making 5 folds out of the training set
folds <- createFolds(credit.train$Status, k = 5) #createFolds is from the caret package
fold.1 <- folds[[1]]
fold.2 <- folds[[2]]
fold.3 <- folds[[3]]
fold.4 <- folds[[4]]
fold.5 <- folds[[5]]
#making a model for each of the five folds
model.1 <- glm(Status ~ ., data = credit.train[fold.1,], family = "binomial")
model.2 <- glm(Status ~ ., data = credit.train[fold.2,], family = "binomial")
model.3 <- glm(Status ~ ., data = credit.train[fold.3,], family = "binomial")
model.4 <- glm(Status ~ ., data = credit.train[fold.4,], family = "binomial")
model.5 <- glm(Status ~ ., data = credit.train[fold.5,], family = "binomial")
#making a prediction for each of the five folds
pred.1 <- ifelse(predict(model.1, credit.test, type = "response")<0.5, 0, 1)
pred.2 <- ifelse(predict(model.2, credit.test, type = "response")<0.5, 0, 1)
pred.3 <- ifelse(predict(model.3, credit.test, type = "response")<0.5, 0, 1)
pred.4 <- ifelse(predict(model.4, credit.test, type = "response")<0.5, 0, 1)
pred.5 <- ifelse(predict(model.5, credit.test, type = "response")<0.5, 0, 1)
#finding the probability (not the 0-1) for each of the five folds in order to take
#the average
response.1 <- predict(model.1, credit.test, type = "response")
response.2 <- predict(model.2, credit.test, type = "response")
response.3 <- predict(model.3, credit.test, type = "response")
response.4 <- predict(model.4, credit.test, type = "response")
response.5 <- predict(model.5, credit.test, type = "response")
pred.total <- ifelse(((response.1 + response.2 + response.3 + response.4 + response.5)/5)<0.5,0,1)
accuracy.1 <- sum(diag(prop.table(table(pred.1,credit.test$Status))))
accuracy.2 <- sum(diag(prop.table(table(pred.2,credit.test$Status))))
accuracy.3 <- sum(diag(prop.table(table(pred.3,credit.test$Status))))
accuracy.4 <- sum(diag(prop.table(table(pred.4,credit.test$Status))))
accuracy.5 <- sum(diag(prop.table(table(pred.5,credit.test$Status))))
accuracy.total <- sum(diag(prop.table(table(pred.total,credit.test$Status))))
##accuracy.total is the CVed model accuracy. now let's find the un-CVed model.
model.no_cv <- glm(Status ~ ., data = credit.train, family = "binomial")
pred.no_cv <- ifelse(predict(model.no_cv, credit.test, type = "response")<0.5, 0, 1)
accuracy.no_cv <- sum(diag(prop.table(table(pred.no_cv,credit.test$Status)))) #still similar, which is interesting.
prop.table(table(pred.total,credit.test$Status)) #CV'ed confusion matrix
prop.table(table(pred.no_cv,credit.test$Status)) #Non-CV'ed confusion matrix
# ------------------------------------------------------------------
# EXERCISE 2
# Redo the previous exercise but with cv.glm() from the boot library.
# ------------------------------------------------------------------
library(boot)
(credit.glm_basic <- glm(Status ~ ., data = credit.train, family = "binomial"))
credit.cv <- cv.glm(credit.train, credit.glm, K = 5)
#a little unsure how to feed this into another model.
# using caret
library(caret)
TRAINCONTROL = trainControl(method = "cv", number = 5, verboseIter = TRUE)
(credit.cv.glm <- train(Status ~ ., data = credit, method = "glm", family = "binomial", trControl = TRAINCONTROL))
credit.cv.pred <- predict(credit.cv.glm, credit.test)
accuracy_cv.glm <- sum(diag(prop.table(table(credit.cv.pred, credit.test$Status))))
#returns a higher accuracy than each of the models or the total model trained
#in the previous by-hand example.
# ------------------------------------------------------------------
# EXERCISE 3
# Use cv.glm() on the Boston data from the MASS package with medv as the target
# variable. Use 10 folds.
# ------------------------------------------------------------------
library(MASS)
str(Boston)
train_ind <- createDataPartition(Boston$medv, p = 0.8)[[1]]
b.train <- Boston[train_ind,]
b.test <- Boston[-train_ind,]
#let's first select the best variables.
TRAINCONTROL = trainControl(method = "cv", number = 10, verboseIter = TRUE)
(b.cv.glm <- train(medv ~ ., data = b.train, method = "glmStepAIC", trControl = TRAINCONTROL)) #r-squared is ~0.75
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment