# ------------------------------------------------------------------
# DAY 12 EXERCISES - LOGISTIC REGRESSION
# ------------------------------------------------------------------
# ------------------------------------------------------------------
# EXERCISE 1
# Create a parsimonious model for the myopia data. Does its performance differ
# substantially from the full model?
# ------------------------------------------------------------------
#install.packages("aplore3")
library(aplore3)
str(myopia)
#install.packages("caret")
library(caret)
index = createDataPartition(myopia$myopic, list = FALSE, p = 0.8)[,1]
myopia.train = myopia[index,]
myopia.test = myopia[-index,]
##make the maximum model first
myopia.glm <- train(myopic ~ ., data = myopia.train, method = "glm")
summary(myopia.glm) #only about 3 of these variables seem to be significant
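# (A quick sketch of how to check this programmatically: pull the coefficient table
# from the fitted glm - this assumes caret keeps it in the finalModel slot, which
# it does for method = "glm".)
coefs <- coef(summary(myopia.glm$finalModel))
coefs[coefs[, "Pr(>|z|)"] < 0.05, ] # terms with p < 0.05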
#let's try doing something with stepwise regression
#get the new model with stepAIC
myopia.step.glm <- train(myopic ~ ., data = myopia.train, method = "glmStepAIC", family = binomial())
##testing our new models
confusionMatrix(predict(myopia.glm, myopia.test), myopia.test$myopic, positive = "Yes") #88.62% accuracy
confusionMatrix(predict(myopia.step.glm, myopia.test), myopia.test$myopic, positive = "Yes") #86.99% accuracy
# ok, so we have lower accuracy with the stepped model, but the stepped
# model also has far fewer variables and could be more practically
# effective if doctors would just like to collect less data.
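# A rough way to quantify the parsimony gain: count the coefficients each model
# keeps (again assuming the fitted glm sits in the finalModel slot).
length(coef(myopia.glm$finalModel))      # full model
length(coef(myopia.step.glm$finalModel)) # stepped model - should be much smaller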
# ------------------------------------------------------------------
# EXERCISE 2
# Build a model which predicts the probability of defaulting on credit card debt.
# Use the Default data from the ISLR package. Start by trying a simple Linear
# Regression model. Satisfy yourself that this is not going to cut the mustard. Now
# try Logistic Regression.
# ------------------------------------------------------------------
install.packages("ISLR")
library(ISLR)
str(Default)
index = createDataPartition(Default$default, list = FALSE, p = 0.8)[,1]
default.train = Default[index,]
default.test = Default[-index,]
### CREATING A LINEAR MODEL
lm.fit <- lm(default~., default.train) #this actually gives you an error
#because R doesn't want you to ever use a linear model on a factor variable.
#Let's make R feel uncomfortable by running it on as.numeric(default)
default.train$default_numeric <- as.numeric(default.train$default) - 1
default.test$default_numeric <- as.numeric(default.test$default) - 1
# so now 0 = no default and 1 = default
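# Quick sanity check that the recoding lines up (rows = factor, cols = 0/1):
table(default.train$default, default.train$default_numeric)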
lm.fit2 <- lm(default_numeric ~ . - default, default.train)
summary(lm.fit2) #looks pretty awful. let's do a logistic instead.
lm_predictions <- predict(lm.fit2, default.test)
#do a weird conversion to binary? I guess if the lm predicts >0.5 == default
## and <0.5 == no default (see the sketch below).
hist(lm_predictions) #looks like nothing is even above 0.5, so this model
#predicts that no one defaults. hm.
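# For completeness, a sketch of that 0.5 cut-off anyway (assuming the factor
# levels of Default$default are "No"/"Yes", as in ISLR):
lm_class <- factor(ifelse(lm_predictions > 0.5, "Yes", "No"),
                   levels = levels(default.test$default))
table(predicted = lm_class, actual = default.test$default)
# with nothing above 0.5, this just predicts "No" for everyone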
### CREATING A LOGISTIC MODEL
# we don't even need the default_numeric variable anymore because
# the wonderful binomial family argument takes care of this for us.
default.train <- default.train[,-5] #getting rid of default_numeric
default.test <- default.test[,-5]
default.glm <- train(default ~ ., data = default.train, method = "glm", family = binomial())
confusionMatrix(predict(default.glm, default.test), default.test$default, positive = "Yes") #97.25% accuracy
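# Worth remembering that defaults are rare in this data, so a model that always
# predicts "No" already scores very high accuracy. The baseline (no-information
# rate) is just the proportion of the majority class:
prop.table(table(default.test$default))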
# ------------------------------------------------------------------
# EXERCISE 3
# Use the birthwt data in the MASS package to construct a model for low birth
# weight. Are there any features which should be excluded from the model?
# ------------------------------------------------------------------
library(MASS)
library(caret)
library(dplyr)
head(birthwt)
str(birthwt) #obviously, we have a numeric birthweight variable.
#let's get rid of that - otherwise there would be no point in building
# a classification model.
birthwt <- subset(birthwt, select = -bwt )
## in order for confusionMatrix() to work, we actually need the "low"
## variable to be a factor. so let's do that before we split the data
## and start training.
birthwt$low <- factor(birthwt$low, labels = c("High","Low"))
# now split + train
index = createDataPartition(birthwt$low, list = FALSE, p = 0.8)[,1]
birthwt.train = birthwt[index,]
birthwt.test = birthwt[-index,]
?birthwt
birthwt.glm <- train(low ~ ., data = birthwt.train, method = "glm", family = binomial())
summary(birthwt.glm) #only some of these variables are important.
birthwt.glm2 <- train(low ~. , data = birthwt.train, method = "glmStepAIC", family = binomial())
summary(birthwt.glm2) #this says that lwt, race, smoke, ptl, ht, and ui are the most important.
confusionMatrix(predict(birthwt.glm, birthwt.test), birthwt.test$low, positive = "Low") #60% accuracy
confusionMatrix(predict(birthwt.glm2, birthwt.test), birthwt.test$low, positive = "Low") #60% accuracy still, though the stepped model seems to do somewhat better predicting negatives
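# Since overall accuracy hides that difference, a sketch of pulling sensitivity
# and specificity out of the caret confusion matrix object:
cm.step <- confusionMatrix(predict(birthwt.glm2, birthwt.test),
                           birthwt.test$low, positive = "Low")
cm.step$byClass[c("Sensitivity", "Specificity")]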
# ------------------------------------------------------------------
# EXERCISE 4
# Use this data to explore the dangers of multicollinearity in regression. The
# response y measures the heat evolved in calories during the hardening of cement
# on a per gram basis. The four predictors are the percentages of four
# ingredients: tricalcium aluminate (x1), tricalcium silicate (x2), tetracalcium
# alumino ferrite (x3), and dicalcium silicate (x4). Make a linear regression with
# all predictors and find their variance inflation factors.
# predictors and find their variance inflation factors.
# Is there anything troubling? What would severe multicollinearity mean for interpreting the variables?
# ------------------------------------------------------------------
library("RCurl")
cement <- read.table("https://onlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/cement.txt", skip = 1)
#install.packages("corrgram")
library(corrgram) #let's use corrgram to explore relationships between variables
corrgram(cement)
fit.lm1 <- lm(y ~ ., cement) #strangely, no variable is significant
# yet this has a 0.9736 r-squared. this is probably a multicollinearity issue.
#check for multicollinearity by doing the following:
#install.packages("car")
library(car) # vif() comes from the car package
vif(fit.lm1) #VIFs should never be above 10. these are >>> 10.
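# For intuition, a VIF is just 1 / (1 - R^2) from regressing one predictor on all
# the others. A sketch for x1 (assuming the columns are named y, x1..x4 as in fit.lm1):
r2.x1 <- summary(lm(x1 ~ x2 + x3 + x4, data = cement))$r.squared
1 / (1 - r2.x1) # should match vif(fit.lm1)["x1"]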
# as a result, we can't really interpret each of the variables and their
# coefficients. we should try removing one and seeing if that solves anything.
cor(cement) #it seems like x3<->x1 have a huge correlation and x2<->x4 have a
#huge correlation. let's adjust the model by removing x3 and x4.
fit.lm2 <- lm( y ~ x1 + x2, cement)
summary(fit.lm2) #we have a very similar r^2 and now have significant values
# for the predictors. we can now interpret the coefficients correctly.