arthurwuhoo · June 16, 2016 18:49
diff --git a/Day 13 - CV Exercises Solutions.R b/Day 13 - CV Exercises Solutions.R
 # ------------------------------------------------------------------
 # DAY 12 EXERCISES - CROSS VALIDATION
 # ------------------------------------------------------------------

 # ------------------------------------------------------------------
 # EXERCISE 1
 # Build a Logistic Regression model classifying the credit status of 
 # customers (good or bad) in this data. Without using any packages, apply 5
 # -fold cross-validation on the model. Once you have five models (and five 
 # sets of predicted values), average them to in order to create a new 
 # estimate. How does this new, averaged model compare in MSE against each of
 # the other five models?
 # ------------------------------------------------------------------

 library(RCurl)

 credit <- read.csv("https://raw.githubusercontent.com/gastonstat/CreditScoring/master/CleanCreditScoring.csv")

 credit <- credit[,1:16] #returning only the ones with numeric values
 str(credit) ##good is 2, bad is 1

 #create training and testing data
 train_ind <- createDataPartition(credit$Status, p = 0.8)[[1]]
 credit.train <- credit[train_ind,]
 credit.test <- credit[-train_ind,]

 #making 5 folds out of the training set
 folds <- createFolds(credit.train$Status, k = 5) #createFolds is from the caret package

 fold.1 <- folds[[1]]
 fold.2 <- folds[[2]]
 fold.3 <- folds[[3]]
 fold.4 <- folds[[4]]
 fold.5 <- folds[[5]]

 #making a model for each of the five folds
 model.1 <-  glm(Status ~ ., data = credit.train[fold.1,], family = "binomial")
 model.2 <-  glm(Status ~ ., data = credit.train[fold.2,], family = "binomial")
 model.3 <-  glm(Status ~ ., data = credit.train[fold.3,], family = "binomial")
 model.4 <-  glm(Status ~ ., data = credit.train[fold.4,], family = "binomial")
 model.5 <-  glm(Status ~ ., data = credit.train[fold.5,], family = "binomial")

 #making a prediction for each of the five folds
 pred.1 <- ifelse(predict(model.1, credit.test, type = "response")<0.5, 0, 1)
 pred.2 <- ifelse(predict(model.2, credit.test, type = "response")<0.5, 0, 1)
 pred.3 <- ifelse(predict(model.3, credit.test, type = "response")<0.5, 0, 1)
 pred.4 <- ifelse(predict(model.4, credit.test, type = "response")<0.5, 0, 1)
 pred.5 <- ifelse(predict(model.5, credit.test, type = "response")<0.5, 0, 1)

 #finding the probability (not the 0-1) for each of the five folds in order to take
 #the average
 response.1 <- predict(model.1, credit.test, type = "response")
 response.2 <- predict(model.2, credit.test, type = "response")
 response.3 <- predict(model.3, credit.test, type = "response")
 response.4 <- predict(model.4, credit.test, type = "response")
 response.5 <- predict(model.5, credit.test, type = "response")
 pred.total <- ifelse(((response.1 + response.2 + response.3 + response.4 + response.5)/5)<0.5,0,1)

 accuracy.1 <- sum(diag(prop.table(table(pred.1,credit.test$Status))))
 accuracy.2 <- sum(diag(prop.table(table(pred.2,credit.test$Status))))
 accuracy.3 <- sum(diag(prop.table(table(pred.3,credit.test$Status))))
 accuracy.4 <- sum(diag(prop.table(table(pred.4,credit.test$Status))))
 accuracy.5 <- sum(diag(prop.table(table(pred.5,credit.test$Status))))
 accuracy.total <- sum(diag(prop.table(table(pred.total,credit.test$Status))))
 ##accuracy.total is the CVed model accuracy. now let's find the un-CVed model.

 model.no_cv <-  glm(Status ~ ., data = credit.train, family = "binomial")
 pred.no_cv <- ifelse(predict(model.no_cv, credit.test, type = "response")<0.5, 0, 1)
 accuracy.no_cv <- sum(diag(prop.table(table(pred.no_cv,credit.test$Status)))) #still similar, which is interesting.

 prop.table(table(pred.total,credit.test$Status)) #CV'ed confusion matrix
 prop.table(table(pred.no_cv,credit.test$Status)) #Non-CV'ed confusion matrix


 # ------------------------------------------------------------------
 # EXERCISE 2
 # Redo the previous exercise but with cv.glm() from the boot library.
 # ------------------------------------------------------------------

 library(boot)
 (credit.glm_basic <- glm(Status ~ ., data = credit.train, family = "binomial"))
 credit.cv <- cv.glm(credit.train, credit.glm, K = 5)

 #a little unsure how to feed this into another model.

 # using caret
 library(caret)
 TRAINCONTROL = trainControl(method = "cv", number = 5, verboseIter = TRUE)

 (credit.cv.glm <- train(Status ~ ., data = credit, method = "glm", family = "binomial", trControl = TRAINCONTROL))

 credit.cv.pred <- predict(credit.cv.glm, credit.test)
 accuracy_cv.glm <- sum(diag(prop.table(table(credit.cv.pred, credit.test$Status))))
 #returns a higher accuracy than each of the models or the total model trained
 #in the previous by-hand example.

 # ------------------------------------------------------------------
 # EXERCISE 3
 # Use cv.glm() on the Boston data from the MASS package with medv as the target 
 # variable. Use 10 folds.
 # ------------------------------------------------------------------
 library(MASS)

 str(Boston)

 train_ind <- createDataPartition(Boston$medv, p = 0.8)[[1]]
 b.train <- Boston[train_ind,]
 b.test <- Boston[-train_ind,]

 #let's first select the best variables.

 TRAINCONTROL = trainControl(method = "cv", number = 10, verboseIter = TRUE)

 (b.cv.glm <- train(medv ~ ., data = b.train, method = "glmStepAIC", trControl = TRAINCONTROL)) #r-squared is ~0.75
	# ------------------------------------------------------------------
	# DAY 12 EXERCISES - CROSS VALIDATION
	# ------------------------------------------------------------------

	# ------------------------------------------------------------------
	# EXERCISE 1
	# Build a Logistic Regression model classifying the credit status of
	# customers (good or bad) in this data. Without using any packages, apply 5
	# -fold cross-validation on the model. Once you have five models (and five
	# sets of predicted values), average them to in order to create a new
	# estimate. How does this new, averaged model compare in MSE against each of
	# the other five models?
	# ------------------------------------------------------------------

	library(RCurl)

	credit <- read.csv("https://raw.githubusercontent.com/gastonstat/CreditScoring/master/CleanCreditScoring.csv")

	credit <- credit[,1:16] #returning only the ones with numeric values
	str(credit) ##good is 2, bad is 1

	#create training and testing data
	train_ind <- createDataPartition(credit$Status, p = 0.8)[[1]]
	credit.train <- credit[train_ind,]
	credit.test <- credit[-train_ind,]

	#making 5 folds out of the training set
	folds <- createFolds(credit.train$Status, k = 5) #createFolds is from the caret package

	fold.1 <- folds[[1]]
	fold.2 <- folds[[2]]
	fold.3 <- folds[[3]]
	fold.4 <- folds[[4]]
	fold.5 <- folds[[5]]

	#making a model for each of the five folds
	model.1 <- glm(Status ~ ., data = credit.train[fold.1,], family = "binomial")
	model.2 <- glm(Status ~ ., data = credit.train[fold.2,], family = "binomial")
	model.3 <- glm(Status ~ ., data = credit.train[fold.3,], family = "binomial")
	model.4 <- glm(Status ~ ., data = credit.train[fold.4,], family = "binomial")
	model.5 <- glm(Status ~ ., data = credit.train[fold.5,], family = "binomial")

	#making a prediction for each of the five folds
	pred.1 <- ifelse(predict(model.1, credit.test, type = "response")<0.5, 0, 1)
	pred.2 <- ifelse(predict(model.2, credit.test, type = "response")<0.5, 0, 1)
	pred.3 <- ifelse(predict(model.3, credit.test, type = "response")<0.5, 0, 1)
	pred.4 <- ifelse(predict(model.4, credit.test, type = "response")<0.5, 0, 1)
	pred.5 <- ifelse(predict(model.5, credit.test, type = "response")<0.5, 0, 1)

	#finding the probability (not the 0-1) for each of the five folds in order to take
	#the average
	response.1 <- predict(model.1, credit.test, type = "response")
	response.2 <- predict(model.2, credit.test, type = "response")
	response.3 <- predict(model.3, credit.test, type = "response")
	response.4 <- predict(model.4, credit.test, type = "response")
	response.5 <- predict(model.5, credit.test, type = "response")
	pred.total <- ifelse(((response.1 + response.2 + response.3 + response.4 + response.5)/5)<0.5,0,1)

	accuracy.1 <- sum(diag(prop.table(table(pred.1,credit.test$Status))))
	accuracy.2 <- sum(diag(prop.table(table(pred.2,credit.test$Status))))
	accuracy.3 <- sum(diag(prop.table(table(pred.3,credit.test$Status))))
	accuracy.4 <- sum(diag(prop.table(table(pred.4,credit.test$Status))))
	accuracy.5 <- sum(diag(prop.table(table(pred.5,credit.test$Status))))
	accuracy.total <- sum(diag(prop.table(table(pred.total,credit.test$Status))))
	##accuracy.total is the CVed model accuracy. now let's find the un-CVed model.

	model.no_cv <- glm(Status ~ ., data = credit.train, family = "binomial")
	pred.no_cv <- ifelse(predict(model.no_cv, credit.test, type = "response")<0.5, 0, 1)
	accuracy.no_cv <- sum(diag(prop.table(table(pred.no_cv,credit.test$Status)))) #still similar, which is interesting.

	prop.table(table(pred.total,credit.test$Status)) #CV'ed confusion matrix
	prop.table(table(pred.no_cv,credit.test$Status)) #Non-CV'ed confusion matrix


	# ------------------------------------------------------------------
	# EXERCISE 2
	# Redo the previous exercise but with cv.glm() from the boot library.
	# ------------------------------------------------------------------

	library(boot)
	(credit.glm_basic <- glm(Status ~ ., data = credit.train, family = "binomial"))
	credit.cv <- cv.glm(credit.train, credit.glm, K = 5)

	#a little unsure how to feed this into another model.

	# using caret
	library(caret)
	TRAINCONTROL = trainControl(method = "cv", number = 5, verboseIter = TRUE)

	(credit.cv.glm <- train(Status ~ ., data = credit, method = "glm", family = "binomial", trControl = TRAINCONTROL))

	credit.cv.pred <- predict(credit.cv.glm, credit.test)
	accuracy_cv.glm <- sum(diag(prop.table(table(credit.cv.pred, credit.test$Status))))
	#returns a higher accuracy than each of the models or the total model trained
	#in the previous by-hand example.

	# ------------------------------------------------------------------
	# EXERCISE 3
	# Use cv.glm() on the Boston data from the MASS package with medv as the target
	# variable. Use 10 folds.
	# ------------------------------------------------------------------
	library(MASS)

	str(Boston)

	train_ind <- createDataPartition(Boston$medv, p = 0.8)[[1]]
	b.train <- Boston[train_ind,]
	b.test <- Boston[-train_ind,]

	#let's first select the best variables.

	TRAINCONTROL = trainControl(method = "cv", number = 10, verboseIter = TRUE)

	(b.cv.glm <- train(medv ~ ., data = b.train, method = "glmStepAIC", trControl = TRAINCONTROL)) #r-squared is ~0.75