# ------------------------------------------------------------------
# DAY 12 EXERCISES - LOGISTIC REGRESSION
# ------------------------------------------------------------------

# ------------------------------------------------------------------
# EXERCISE 1
# Create a parsimonious model for the myopia data. Does its performance differ
# substantially from the full model?
# ------------------------------------------------------------------
#install.packages("aplore3")
library(aplore3)
str(myopia)

#install.packages("caret")
library(caret)

index = createDataPartition(myopia$myopic, list = FALSE, p = 0.8)[,1]
myopia.train = myopia[index,]
myopia.test = myopia[-index,]

## fit the full model first
myopia.glm <- train(myopic ~ ., data = myopia.train, method = "glm")
summary(myopia.glm) # only about 3 of these variables seem to be significant
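# (sketch) one way to see exactly which coefficients clear p < 0.05: pull the
# coefficient table out of the glm that caret stores in $finalModel.
coefs <- summary(myopia.glm$finalModel)$coefficients
coefs[coefs[, "Pr(>|z|)"] < 0.05, , drop = FALSE]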
# let's try stepwise regression instead
# get the reduced model with stepAIC
myopia.step.glm <- train(myopic ~ ., data = myopia.train, method = "glmStepAIC", family = binomial())

## testing our two models
confusionMatrix(predict(myopia.glm, myopia.test), myopia.test$myopic, positive = "Yes")      # 88.62% accuracy
confusionMatrix(predict(myopia.step.glm, myopia.test), myopia.test$myopic, positive = "Yes") # 86.99% accuracy
# The stepped model has slightly lower accuracy, but it also uses far fewer
# variables and could be more useful in practice if doctors would rather
# collect less data.
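# (sketch) a quick check on how much smaller the stepped model really is:
# compare the number of fitted coefficients in the two final glm objects.
length(coef(myopia.glm$finalModel))      # full model
length(coef(myopia.step.glm$finalModel)) # reduced model chosen by stepAIC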
# ------------------------------------------------------------------
# EXERCISE 2
# Build a model which predicts the probability of defaulting on credit card debt.
# Use the Default data from the ISLR package. Start by trying a simple linear
# regression model. Satisfy yourself that this is not going to cut the mustard.
# Now try logistic regression.
# ------------------------------------------------------------------
#install.packages("ISLR")
library(ISLR)
str(Default)

index = createDataPartition(Default$default, list = FALSE, p = 0.8)[,1]
default.train = Default[index,]
default.test = Default[-index,]
### CREATING A LINEAR MODEL
lm.fit <- lm(default ~ ., default.train) # this actually gives you an error
# because R doesn't want you to fit a linear model to a factor response.
# Let's make R feel uncomfortable by running it on as.numeric(default) instead.
default.train$default_numeric <- as.numeric(default.train$default) - 1
default.test$default_numeric <- as.numeric(default.test$default) - 1
# so now 0 = no default and 1 = default
lm.fit2 <- lm(default_numeric ~ . - default, default.train)
summary(lm.fit2) # looks pretty awful. let's do a logistic instead.
lm_predictions <- predict(lm.fit2, default.test)
# to turn these into classes we'd need an arbitrary cutoff: say a prediction
# > 0.5 means "default" and <= 0.5 means "no default".
hist(lm_predictions) # nothing is even above 0.5, so this model
# predicts that no one defaults. hm.
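# (sketch) making the 0.5 cutoff explicit: map the lm output onto the original
# class labels and tabulate against the truth. The "No"/"Yes" level names come
# from ISLR's Default data.
lm_classes <- factor(ifelse(lm_predictions > 0.5, "Yes", "No"),
                     levels = levels(default.test$default))
table(lm_classes, default.test$default) # every prediction lands in "No", so no defaults are caught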
### CREATING A LOGISTIC MODEL
# we don't even need the default_numeric variable anymore because
# the wonderful binomial family argument takes care of this for us.
default.train <- default.train[,-5] # drop default_numeric
default.test <- default.test[,-5]

default.glm <- train(default ~ ., data = default.train, method = "glm", family = binomial())
confusionMatrix(predict(default.glm, default.test), default.test$default, positive = "Yes") # 97.25% accuracy
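# (sketch) 97% accuracy is less impressive than it sounds because defaults are rare;
# compare it against the no-information rate (always predicting "No"), which
# confusionMatrix() also reports.
mean(default.test$default == "No")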
# ------------------------------------------------------------------
# EXERCISE 3
# Use the birthwt data in the MASS package to construct a model for low birth
# weight. Are there any features which should be excluded from the model?
# ------------------------------------------------------------------
library(MASS)
library(caret)
library(dplyr)

head(birthwt)
str(birthwt) # obviously, we have a numeric birthweight variable (bwt).
# let's get rid of that - otherwise there would be no point in building
# a classification model for "low".
birthwt <- subset(birthwt, select = -bwt)
## in order for confusionMatrix() to work, we actually need the "low"
## variable to be a factor, so let's do that before we split the data
## and start training.
birthwt$low <- factor(birthwt$low, labels = c("High","Low"))

# now split + train
index = createDataPartition(birthwt$low, list = FALSE, p = 0.8)[,1]
birthwt.train = birthwt[index,]
birthwt.test = birthwt[-index,]

?birthwt # check the variable descriptions
birthwt.glm <- train(low ~ ., data = birthwt.train, method = "glm", family = binomial())
summary(birthwt.glm) # only some of these variables are important.
birthwt.glm2 <- train(low ~ ., data = birthwt.train, method = "glmStepAIC", family = binomial())
summary(birthwt.glm2) # this says that lwt, race, smoke, ptl, ht, and ui are the most important.
confusionMatrix(predict(birthwt.glm, birthwt.test), birthwt.test$low, positive = "Low")  # 60% accuracy
confusionMatrix(predict(birthwt.glm2, birthwt.test), birthwt.test$low, positive = "Low") # still 60% accuracy,
# though the stepped model seems to do somewhat better on the negative class.
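# (sketch) to back up the comment above, pull sensitivity and specificity for the
# two models out of the confusionMatrix objects instead of eyeballing accuracy.
cm.full <- confusionMatrix(predict(birthwt.glm,  birthwt.test), birthwt.test$low, positive = "Low")
cm.step <- confusionMatrix(predict(birthwt.glm2, birthwt.test), birthwt.test$low, positive = "Low")
rbind(full = cm.full$byClass[c("Sensitivity", "Specificity")],
      step = cm.step$byClass[c("Sensitivity", "Specificity")])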
# ------------------------------------------------------------------
# EXERCISE 4
# Use this data to explore the dangers of multicollinearity in regression. The
# response y measures the heat evolved in calories during the hardening of cement
# on a per gram basis. The four predictors are the percentages of four ingredients:
# tricalcium aluminate (x1), tricalcium silicate (x2), tetracalcium alumino
# ferrite (x3), and dicalcium silicate (x4). Make a linear regression with all
# predictors and find their variance inflation factors.
# Is there anything troubling? What would severe multicollinearity mean for
# interpreting the variables?
# ------------------------------------------------------------------
library("RCurl")
cement <- read.table("https://onlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/cement.txt", skip = 1)
# (the model code below expects the columns to be named y, x1, x2, x3, x4)

#install.packages("corrgram")
library(corrgram) # let's use corrgram to explore relationships between variables
corrgram(cement)

library(car) # for vif()
fit.lm1 <- lm(y ~ ., cement) # strangely, no variable is significant,
# yet this has a 0.9736 r-squared. this is probably a multicollinearity issue.
# check for multicollinearity by computing variance inflation factors:
vif(fit.lm1) # VIFs should never be above 10, and these are way above that.
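# (sketch) what a VIF actually is: 1 / (1 - R^2) from regressing one predictor
# on the others. Computing it by hand for x1 should match vif(fit.lm1)["x1"].
1 / (1 - summary(lm(x1 ~ x2 + x3 + x4, data = cement))$r.squared)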
# as a result, we can't really interpret the individual variables and their
# coefficients. we should try removing some and see if that solves anything.
cor(cement) # x1<->x3 are very strongly correlated, and so are x2<->x4;
# let's adjust the model by dropping x3 and x4.
fit.lm2 <- lm(y ~ x1 + x2, cement)
summary(fit.lm2) # we get a very similar r^2, and the predictors are now
# significant, so we can interpret the coefficients sensibly.
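# (sketch) sanity check: VIFs for the trimmed model should now be close to 1,
# confirming the collinearity problem is gone.
vif(fit.lm2)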