# ------------------------------------------------------------------
# DAY 12 EXERCISES - LOGISTIC REGRESSION
# ------------------------------------------------------------------
# ------------------------------------------------------------------
# EXERCISE 1
# Create a parsimonious model for the myopia data. Does its performance differ
# substantially from the full model?
# ------------------------------------------------------------------
#install.packages("aplore3")
library(aplore3)
str(myopia)
#install.packages("caret")
library(caret)
index = createDataPartition(myopia$myopic, list = FALSE, p = 0.8)[,1]
myopia.train = myopia[index,]
myopia.test = myopia[-index,]
##make the maximum model first
myopia.glm <- train(myopic ~ ., data = myopia.train, method = "glm")
summary(myopia.glm) #only about 3 of these variables seem to be significant
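# (A quick sketch of how to check this programmatically: pull the coefficient table
# from the fitted glm - this assumes caret keeps it in the finalModel slot, which
# it does for method = "glm".)
coefs <- coef(summary(myopia.glm$finalModel))
coefs[coefs[, "Pr(>|z|)"] < 0.05, ] # terms with p < 0.05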
#let's try doing something with stepwise regression
#get the new model with stepAIC
myopia.step.glm <- train(myopic ~ ., data = myopia.train, method = "glmStepAIC", family = binomial())
##testing our new models
confusionMatrix(predict(myopia.glm, myopia.test), myopia.test$myopic, positive = "Yes") #88.62% accuracy
confusionMatrix(predict(myopia.step.glm, myopia.test), myopia.test$myopic, positive = "Yes") #86.99% accuracy
# ok, so we have lower accuracy with the stepped model, but the stepped
# model also has far fewer variables and could be more practically
# effective if doctors would just like to collect less data.
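# A rough way to quantify the parsimony gain: count the coefficients each model
# keeps (again assuming the fitted glm sits in the finalModel slot).
length(coef(myopia.glm$finalModel))      # full model
length(coef(myopia.step.glm$finalModel)) # stepped model - should be much smaller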
# ------------------------------------------------------------------
# EXERCISE 2
# Build a model which predicts the probability of defaulting on credit card debt.
# Use the Default data from the ISLR package. Start by trying a simple Linear
# Regression model. Satisfy yourself that this is not going to cut the mustard. Now
# try Logistic Regression.
# ------------------------------------------------------------------
install.packages("ISLR")
library(ISLR)
str(Default)
index = createDataPartition(Default$default, list = FALSE, p = 0.8)[,1]
default.train = Default[index,]
default.test = Default[-index,]
### CREATING A LINEAR MODEL
lm.fit <- lm(default~., default.train) #this actually gives you an error
#because R doesn't want you to ever use a linear model on a factor variable.
#Let's make R feel uncomfortable by running it on as.numeric(default)
default.train$default_numeric <- as.numeric(default.train$default) - 1
default.test$default_numeric <- as.numeric(default.test$default) - 1
# so now 0 = no default and 1 = default
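# Quick sanity check that the recoding lines up (rows = factor, cols = 0/1):
table(default.train$default, default.train$default_numeric)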
lm.fit2 <- lm(default_numeric ~ . - default, default.train)
summary(lm.fit2) #looks pretty awful. let's do a logistic instead.
lm_predictions <- predict(lm.fit2, default.test)
#do a weird conversion to binary? I guess if the lm predicts >0.5 == default
## and <0.5 == no default (see the sketch below).
hist(lm_predictions) #looks like nothing is even above 0.5, so this model
#predicts that no one defaults. hm.
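# For completeness, a sketch of that 0.5 cut-off anyway (assuming the factor
# levels of Default$default are "No"/"Yes", as in ISLR):
lm_class <- factor(ifelse(lm_predictions > 0.5, "Yes", "No"),
                   levels = levels(default.test$default))
table(predicted = lm_class, actual = default.test$default)
# with nothing above 0.5, this just predicts "No" for everyone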
### CREATING A LOGISTIC MODEL
# we don't even need the default_numeric variable anymore because
# the wonderful binomial family argument takes care of this for us.
default.train <- default.train[,-5] #getting rid of default_numeric
default.test <- default.test[,-5]
default.glm <- train(default ~ ., data = default.train, method = "glm", family = binomial())
confusionMatrix(predict(default.glm, default.test), default.test$default, positive = "Yes") #97.25% accuracy
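# Worth remembering that defaults are rare in this data, so a model that always
# predicts "No" already scores very high accuracy. The baseline (no-information
# rate) is just the proportion of the majority class:
prop.table(table(default.test$default))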
# ------------------------------------------------------------------
# EXERCISE 3
# Use the birthwt data in the MASS package to construct a model for low birth
# weight. Are there any features which should be excluded from the model?
# ------------------------------------------------------------------
library(MASS)
library(caret)
library(dplyr)
head(birthwt)
str(birthwt) #obviously, we have a numeric birthweight variable.
#let's get rid of that - otherwise there would be no point in building
# a classification model.
birthwt <- subset(birthwt, select = -bwt )
## in order for confusionMatrix() to work, we actually need the "low"
## variable to be a factor. so let's do that before we split the data
## and start training.
birthwt$low <- factor(birthwt$low, labels = c("High","Low"))
# now split + train
index = createDataPartition(birthwt$low, list = FALSE, p = 0.8)[,1]
birthwt.train = birthwt[index,]
birthwt.test = birthwt[-index,]
?birthwt
birthwt.glm <- train(low ~ ., data = birthwt.train, method = "glm", family = binomial())
summary(birthwt.glm) #only some of these variables are important.
birthwt.glm2 <- train(low ~. , data = birthwt.train, method = "glmStepAIC", family = binomial())
summary(birthwt.glm2) #this says that lwt, race, smoke, ptl, ht, and ui are the most important.
confusionMatrix(predict(birthwt.glm, birthwt.test), birthwt.test$low, positive = "Low") #60% accuracy
confusionMatrix(predict(birthwt.glm2, birthwt.test), birthwt.test$low, positive = "Low") #60% accuracy still, though the stepped model seems to do somewhat better predicting negatives
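# Since overall accuracy hides that difference, a sketch of pulling sensitivity
# and specificity out of the caret confusion matrix object:
cm.step <- confusionMatrix(predict(birthwt.glm2, birthwt.test),
                           birthwt.test$low, positive = "Low")
cm.step$byClass[c("Sensitivity", "Specificity")]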
# ------------------------------------------------------------------
# EXERCISE 4
# Use this data to explore the dangers of multicollinearity in regression. The
# response y measures the heat evolved in calories during the hardening of cement
# on a per gram basis. The four predictors are the percentages of four
# ingredients: tricalcium aluminate (x1), tricalcium silicate (x2), tetracalcium
# alumino ferrite (x3), and dicalcium silicate (x4). Make a linear regression with
# all predictors and find their variance inflation factors.
# predictors and find their variance inflation factors.
# Is there anything troubling? What would severe multicollinearity mean for interpreting the variables?
# ------------------------------------------------------------------
library("RCurl")
cement <- read.table("https://onlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/cement.txt", skip = 1)
#install.packages("corrgram")
library(corrgram) #let's use corrgram to explore relationships between variables
corrgram(cement)
fit.lm1 <- lm(y ~ ., cement) #strangely, no variable is significant
# yet this has a 0.9736 r-squared. this is probably a multicollinearity issue.
#check for multicollinearity by doing the following:
#install.packages("car")
library(car) # vif() comes from the car package
vif(fit.lm1) #VIFs should never be above 10. these are >>> 10.
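# For intuition, a VIF is just 1 / (1 - R^2) from regressing one predictor on all
# the others. A sketch for x1 (assuming the columns are named y, x1..x4 as in fit.lm1):
r2.x1 <- summary(lm(x1 ~ x2 + x3 + x4, data = cement))$r.squared
1 / (1 - r2.x1) # should match vif(fit.lm1)["x1"]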
# as a result, we can't really interpret each of the variables and their
# coefficients. we should try removing one and seeing if that solves anything.
cor(cement) #it seems like x3<->x1 have a huge correlation and x2<->x4 have a
#huge correlation. let's adjust the model by removing x3 and x4.
fit.lm2 <- lm( y ~ x1 + x2, cement)
summary(fit.lm2) #we have a very similar r^2 and now have significant values
# for the predictors. we can now interpret the coefficients correctly.