Created
June 16, 2016 18:49
-
-
Save arthurwuhoo/e101bf207137af0bda1765e22089dfa8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------------------------ | |
# DAY 12 EXERCISES - CROSS VALIDATION | |
# ------------------------------------------------------------------ | |
# ------------------------------------------------------------------ | |
# EXERCISE 1 | |
# Build a Logistic Regression model classifying the credit status of | |
# customers (good or bad) in this data. Without using any packages, apply 5 | |
# -fold cross-validation on the model. Once you have five models (and five | |
# sets of predicted values), average them to in order to create a new | |
# estimate. How does this new, averaged model compare in MSE against each of | |
# the other five models? | |
# ------------------------------------------------------------------ | |
library(RCurl) | |
credit <- read.csv("https://raw.githubusercontent.com/gastonstat/CreditScoring/master/CleanCreditScoring.csv") | |
credit <- credit[,1:16] #returning only the ones with numeric values | |
str(credit) ##good is 2, bad is 1 | |
#create training and testing data | |
train_ind <- createDataPartition(credit$Status, p = 0.8)[[1]] | |
credit.train <- credit[train_ind,] | |
credit.test <- credit[-train_ind,] | |
#making 5 folds out of the training set | |
folds <- createFolds(credit.train$Status, k = 5) #createFolds is from the caret package | |
fold.1 <- folds[[1]] | |
fold.2 <- folds[[2]] | |
fold.3 <- folds[[3]] | |
fold.4 <- folds[[4]] | |
fold.5 <- folds[[5]] | |
#making a model for each of the five folds | |
model.1 <- glm(Status ~ ., data = credit.train[fold.1,], family = "binomial") | |
model.2 <- glm(Status ~ ., data = credit.train[fold.2,], family = "binomial") | |
model.3 <- glm(Status ~ ., data = credit.train[fold.3,], family = "binomial") | |
model.4 <- glm(Status ~ ., data = credit.train[fold.4,], family = "binomial") | |
model.5 <- glm(Status ~ ., data = credit.train[fold.5,], family = "binomial") | |
#making a prediction for each of the five folds | |
pred.1 <- ifelse(predict(model.1, credit.test, type = "response")<0.5, 0, 1) | |
pred.2 <- ifelse(predict(model.2, credit.test, type = "response")<0.5, 0, 1) | |
pred.3 <- ifelse(predict(model.3, credit.test, type = "response")<0.5, 0, 1) | |
pred.4 <- ifelse(predict(model.4, credit.test, type = "response")<0.5, 0, 1) | |
pred.5 <- ifelse(predict(model.5, credit.test, type = "response")<0.5, 0, 1) | |
#finding the probability (not the 0-1) for each of the five folds in order to take | |
#the average | |
response.1 <- predict(model.1, credit.test, type = "response") | |
response.2 <- predict(model.2, credit.test, type = "response") | |
response.3 <- predict(model.3, credit.test, type = "response") | |
response.4 <- predict(model.4, credit.test, type = "response") | |
response.5 <- predict(model.5, credit.test, type = "response") | |
pred.total <- ifelse(((response.1 + response.2 + response.3 + response.4 + response.5)/5)<0.5,0,1) | |
accuracy.1 <- sum(diag(prop.table(table(pred.1,credit.test$Status)))) | |
accuracy.2 <- sum(diag(prop.table(table(pred.2,credit.test$Status)))) | |
accuracy.3 <- sum(diag(prop.table(table(pred.3,credit.test$Status)))) | |
accuracy.4 <- sum(diag(prop.table(table(pred.4,credit.test$Status)))) | |
accuracy.5 <- sum(diag(prop.table(table(pred.5,credit.test$Status)))) | |
accuracy.total <- sum(diag(prop.table(table(pred.total,credit.test$Status)))) | |
##accuracy.total is the CVed model accuracy. now let's find the un-CVed model. | |
model.no_cv <- glm(Status ~ ., data = credit.train, family = "binomial") | |
pred.no_cv <- ifelse(predict(model.no_cv, credit.test, type = "response")<0.5, 0, 1) | |
accuracy.no_cv <- sum(diag(prop.table(table(pred.no_cv,credit.test$Status)))) #still similar, which is interesting. | |
prop.table(table(pred.total,credit.test$Status)) #CV'ed confusion matrix | |
prop.table(table(pred.no_cv,credit.test$Status)) #Non-CV'ed confusion matrix | |
# ------------------------------------------------------------------ | |
# EXERCISE 2 | |
# Redo the previous exercise but with cv.glm() from the boot library. | |
# ------------------------------------------------------------------ | |
library(boot) | |
(credit.glm_basic <- glm(Status ~ ., data = credit.train, family = "binomial")) | |
credit.cv <- cv.glm(credit.train, credit.glm, K = 5) | |
#a little unsure how to feed this into another model. | |
# using caret | |
library(caret) | |
TRAINCONTROL = trainControl(method = "cv", number = 5, verboseIter = TRUE) | |
(credit.cv.glm <- train(Status ~ ., data = credit, method = "glm", family = "binomial", trControl = TRAINCONTROL)) | |
credit.cv.pred <- predict(credit.cv.glm, credit.test) | |
accuracy_cv.glm <- sum(diag(prop.table(table(credit.cv.pred, credit.test$Status)))) | |
#returns a higher accuracy than each of the models or the total model trained | |
#in the previous by-hand example. | |
# ------------------------------------------------------------------ | |
# EXERCISE 3 | |
# Use cv.glm() on the Boston data from the MASS package with medv as the target | |
# variable. Use 10 folds. | |
# ------------------------------------------------------------------ | |
library(MASS) | |
str(Boston) | |
train_ind <- createDataPartition(Boston$medv, p = 0.8)[[1]] | |
b.train <- Boston[train_ind,] | |
b.test <- Boston[-train_ind,] | |
#let's first select the best variables. | |
TRAINCONTROL = trainControl(method = "cv", number = 10, verboseIter = TRUE) | |
(b.cv.glm <- train(medv ~ ., data = b.train, method = "glmStepAIC", trControl = TRAINCONTROL)) #r-squared is ~0.75 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment