SDS/CSC 293 Regression Code
#------------------------------------------------------------------------------
# Lec08: 2019/02/25
#------------------------------------------------------------------------------
library(tidyverse)
library(broom)
library(stringr)

#------------------------------------------------------------------------------
# Data for today
# Read over the help file
?mtcars

# Data wrangling
mtcars <- mtcars %>%
  # Convert to tibble data frame:
  as_tibble() %>%
  # Add identification variable as first column:
  mutate(ID = 1:n()) %>%
  select(ID, everything()) %>%
  # vs & am variables were recorded as numerical 0/1, but are really categorical,
  # so convert them:
  mutate(
    vs = ifelse(vs == 0, "V-shaped", "straight"),
    am = ifelse(am == 0, "automatic", "manual")
  )

# Set up validation set framework: create training and test set at random
set.seed(76)
mtcars_train <- mtcars %>%
  sample_frac(0.75)
mtcars_test <- mtcars %>%
  anti_join(mtcars_train, by = "ID")
glimpse(mtcars_train)
glimpse(mtcars_test)
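# Quick sanity check (optional): the two sets should partition mtcars, i.e.
# every row lands in exactly one of them and no ID appears in both.
nrow(mtcars_train) + nrow(mtcars_test) == nrow(mtcars)
intersect(mtcars_train$ID, mtcars_test$ID)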
#------------------------------------------------------------------------------
# Model 1:
# y: fuel efficiency: measured in miles per gallon
# x: engine horsepower: unit of power where 1hp = work needed to lift 75kg a
#    vertical distance of 1 meter in 1 second: https://en.wikipedia.org/wiki/Horsepower

# 1. Fit model to training data
model_1_formula <- as.formula("mpg ~ hp")
model_1 <- lm(model_1_formula, data = mtcars_train)

# 2.a) Extract regression table with confidence intervals
model_1 %>%
  broom::tidy(conf.int = TRUE)

# 2.b) Extract point-by-point info of points used to fit model
fitted_points_1 <- model_1 %>%
  broom::augment()
fitted_points_1

# 2.c) Extract model summary info
model_1 %>%
  broom::glance()
# 3. Make predictions on test data. Compare this to the use of broom::augment()
#    for fitted_points_1 above:
predicted_points_1 <- model_1 %>%
  broom::augment(newdata = mtcars_test)
predicted_points_1
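# Optional comparison: see which columns broom::augment() adds in each case,
# i.e. diagnostic columns for the training fit vs. predictions for the test set.
names(fitted_points_1)
names(predicted_points_1)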
# 4. Visualize
ggplot(NULL) +
  # Training data with black points:
  geom_point(data = fitted_points_1, aes(x = hp, y = mpg)) +
  # Fitted simple linear regression model with blue line:
  geom_line(data = fitted_points_1, aes(x = hp, y = .fitted), col = "blue") +
  # Predictions for test set with red points:
  geom_point(data = predicted_points_1, aes(x = hp, y = .fitted), col = "red") +
  labs(x = "Horsepower", y = "Miles per gallon")
#------------------------------------------------------------------------------
# Model 2:
# y: fuel efficiency: measured in miles per gallon
# x: all numerical predictors

# 1. Fit model to training data
model_2_formula <- as.formula("mpg ~ cyl + disp + hp + drat + wt + qsec + gear + carb")
model_2 <- lm(model_2_formula, data = mtcars_train)
# Hint for generating the right-hand side (RHS) of formulas when you have a lot
# of predictors. Note that ID, mpg, and the categorical vs & am were then removed
# by hand to get the RHS used above.
names(mtcars) %>%
  str_c(collapse = " + ")
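# A fuller version of the hint (a sketch): drop ID, mpg, and the two categorical
# variables first, then build the formula programmatically with base R's reformulate():
rhs_vars <- setdiff(names(mtcars), c("ID", "mpg", "vs", "am"))
model_2_formula_alt <- reformulate(rhs_vars, response = "mpg")
model_2_formula_alt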
# 2.a) Extract regression table with confidence intervals
model_2 %>%
  broom::tidy(conf.int = TRUE)

# 2.b) Extract point-by-point info of points used to fit model
fitted_points_2 <- model_2 %>%
  broom::augment()
fitted_points_2

# 2.c) Extract model summary info
model_2 %>%
  broom::glance()
# 3. Make predictions on test data. Compare this to the use of broom::augment()
#    for fitted_points_2 above:
predicted_points_2 <- model_2 %>%
  broom::augment(newdata = mtcars_test)
predicted_points_2
#------------------------------------------------------------------------------
# Lec08 Exercises. Solutions at bottom
# 1. What are the test set RMSEs of Models 1 & 2? Which is higher?
# 2. What is the ratio n/p for our trained Model 2, i.e. the number of points
#    in the training set divided by the number of predictors?
# 3. Change the train/test validation ratio from 3:1 to 1:1. What are the RMSEs
#    of Models 1 & 2? Which is higher?
# 4. What is the new ratio n/p for our new trained Model 2?
# 5. How does the difference in test set RMSE between Models 1 & 2 change when
#    the train/test validation ratio goes from 3:1 to 1:1?
# 6. Try a different combination of variables and see if you can lower your
#    RMSE.
#------------------------------------------------------------------------------
# Lec10: 2019/03/04
#------------------------------------------------------------------------------
library(tidyverse)
library(broom)

# Read in training data from https://www.kaggle.com/c/GiveMeSomeCredit/
financial_distress_orig <-
  "https://rudeboybert.github.io/SDS293/static/methods/logisitic/cs-training.csv" %>%
  read_csv() %>%
  select(ID = X1, in_financial_distress = SeriousDlqin2yrs, age)

# Let's deliberately tinker and engineer this data for educational purposes
# only: for those individuals who are in financial distress, let's add an offset
# of 50 to their ages.
offset <- 50
financial_distress <- financial_distress_orig %>%
  mutate(age = ifelse(in_financial_distress == 1, age + offset, age))
# Split data into train and test so that we can fit to the training set and
# predict on the test set. Note that this corresponds to the "validation set"
# approach, used here mostly for illustrative purposes; it isn't used much in
# practice because you don't end up making predictions on every observation.
# Be sure to View() these data frames after you create them:
set.seed(76)
cs_training <- financial_distress %>%
  sample_frac(0.25)
cs_test <- financial_distress %>%
  anti_join(cs_training, by = "ID")
# EDA: Recall that we engineered the two boxplots to not overlap by adding an
# offset
ggplot(cs_training, aes(x = as.logical(in_financial_distress), y = age)) +
  geom_boxplot() +
  labs(x = "In financial distress?", y = "Age")
# Let's create a scatterplot but with age on the x-axis. Note this plot suffers
# from overplotting:
ggplot(cs_training, aes(x = age, y = in_financial_distress)) +
  geom_point() +
  labs(x = "Age", y = "In financial distress?")
# Let's "jitter" the plot a little to break up the overplotting. In other words,
# add random vertical "nudges" to the points so that we can get a sense of how
# many points are on top of each other. Note this is only a visualization tool;
# it does not alter the original values in the data frame.
# For more info on geom_jitter() read:
# https://moderndive.netlify.com/3-viz.html#overplotting
ggplot(cs_training, aes(x = age, y = in_financial_distress)) +
  geom_jitter(height = 0.01) +
  labs(x = "Age", y = "In financial distress?")
# The best-fitting linear regression line in blue is no good in this particular
# case: you end up with fitted probabilities less than 0 (and nothing stops
# them from exceeding 1 either).
ggplot(cs_training, aes(x = age, y = in_financial_distress)) +
  geom_jitter(height = 0.01) +
  labs(x = "Age", y = "In financial distress?") +
  geom_smooth(method = "lm", se = FALSE)
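# To see this concretely (a quick optional check): fit the corresponding linear
# model and look at the range of its fitted values; the lower end falls below 0,
# which is impossible for a probability.
lm_linear_prob <- lm(in_financial_distress ~ age, data = cs_training)
range(fitted(lm_linear_prob))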
# Fit a logistic regression model. Note the use of glm() instead of lm():
model_logistic <- glm(in_financial_distress ~ age, family = "binomial", data = cs_training)

# 2.a) Extract regression table with confidence intervals
# Notice coefficient for age. Is it positive or negative?
model_logistic %>%
  broom::tidy(conf.int = TRUE)

# 2.b) Extract point-by-point info of points used to fit model
fitted_points_logistic <- model_logistic %>%
  broom::augment()
fitted_points_logistic
# Note however that the .fitted values are fitted log-odds, NOT fitted
# probabilities. We convert to fitted probabilities using the inverse-logit function:
fitted_points_logistic <- fitted_points_logistic %>%
  mutate(fitted_prob = 1/(1 + exp(-.fitted)))
fitted_points_logistic
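# Equivalent shortcuts (a sketch): base R's plogis() is the inverse-logit, and
# broom::augment() can return probabilities directly via type.predict = "response":
fitted_points_logistic %>%
  mutate(fitted_prob_check = plogis(.fitted))
model_logistic %>%
  broom::augment(type.predict = "response")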
# 2.c) Extract model summary info
model_logistic %>%
  broom::glance()

# 3. Make predictions on test data. Compare this to the use of broom::augment()
#    for fitted_points_logistic above:
predicted_points_logistic <- model_logistic %>%
  broom::augment(newdata = cs_test)
predicted_points_logistic
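# As with the training data, convert the predicted log-odds for the test set
# into predicted probabilities:
predicted_points_logistic <- predicted_points_logistic %>%
  mutate(fitted_prob = 1/(1 + exp(-.fitted)))
predicted_points_logistic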
# 4. Visualize the fitted model using only the training data for now:
ggplot(data = fitted_points_logistic, aes(x = age, y = in_financial_distress)) +
  # Training data with black points:
  geom_jitter(height = 0.01) +
  # Best-fitting linear regression line in blue:
  geom_smooth(method = "lm", se = FALSE) +
  # Best-fitting logistic curve in red:
  geom_line(data = fitted_points_logistic, mapping = aes(y = fitted_prob), col = "red", size = 1) +
  labs(x = "Age", y = "In financial distress?")
#------------------------------------------------------------------------------
# Lec10 Exercises. Solutions at bottom
# 1. Using the visualization above, for what age would you say that there is a
#    50% probability that an individual is in financial distress?
# 2. Compare the visualization above with a scatterplot of:
#    a) x = age
#    b) y = the observed proportion of individuals in cs_training that are in
#       financial distress, for each age.
# 3. Change the offset in age to 10 and -50. What do you notice happens to:
#    a) the coefficient for age in the regression table?
#    b) the shape of the logistic curve of the fitted model?
# 4. Challenge question: Change the offset in age to 6.9. Why is the logistic
#    curve flat? At what value is it?
#------------------------------------------------------------------------------
# Lec08 Solutions
# 1. What are the test set RMSEs of Models 1 & 2? Which is higher?
#    Being sure to set.seed(76):
predicted_points_1 %>%
  yardstick::rmse(truth = mpg, estimate = .fitted)
predicted_points_2 %>%
  yardstick::rmse(truth = mpg, estimate = .fitted)
# Model 1 = 5.38 < 9.05 = Model 2
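# The same RMSEs computed "by hand", as a check on the yardstick output
# (RMSE = square root of the mean squared prediction error):
predicted_points_1 %>%
  summarize(rmse = sqrt(mean((mpg - .fitted)^2)))
predicted_points_2 %>%
  summarize(rmse = sqrt(mean((mpg - .fitted)^2)))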
# 2. What is the ratio n/p for our trained Model 2, i.e. the number of points
#    in the training set divided by the number of predictors?
#    n = 24, p = 8, thus n/p = 3
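# Verifying n and p in code: n is the number of training rows, p the number of
# predictor terms on the RHS of the formula (all.vars() also counts the response
# mpg, hence the minus 1):
n <- nrow(mtcars_train)
p <- length(all.vars(model_2_formula)) - 1
n / p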
# 3. Change the train/test validation ratio from 3:1 to 1:1. What are the RMSEs
#    of Models 1 & 2? Which is higher?
#    Being sure to set.seed(76):
#    Model 1 = 5.05 < 7.89 = Model 2
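# One way to reproduce this (a sketch; exact RMSEs depend on the random split,
# and the _50 object names are just illustrative):
set.seed(76)
mtcars_train_50 <- mtcars %>% sample_frac(0.5)
mtcars_test_50 <- mtcars %>% anti_join(mtcars_train_50, by = "ID")
lm(model_1_formula, data = mtcars_train_50) %>%
  broom::augment(newdata = mtcars_test_50) %>%
  yardstick::rmse(truth = mpg, estimate = .fitted)
lm(model_2_formula, data = mtcars_train_50) %>%
  broom::augment(newdata = mtcars_test_50) %>%
  yardstick::rmse(truth = mpg, estimate = .fitted)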
# 4. What is the new ratio n/p for our new trained Model 2?
#    n = 16, p = 8, thus n/p = 2
# 5. How does the difference in test set RMSE between Models 1 & 2 change when
#    the train/test validation ratio goes from 3:1 to 1:1?
#    3:1 split: 5.38 vs 9.05
#    1:1 split: 5.05 vs 7.89
#    The gap happens to narrow here, but with so few test observations these
#    RMSE estimates are noisy; in general, overfitting becomes a bigger concern
#    as the ratio n/p gets smaller.
# 6. Try a different combination of variables and see if you can lower your
#    RMSE.
#------------------------------------------------------------------------------
# Lec10 Solutions
# 1. Using the visualization above, for what age would you say that there is a
#    50% probability that an individual is in financial distress?
#    Maybe around 87?
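# For an exact answer (a sketch): the fitted probability is 50% where the fitted
# log-odds equal 0, i.e. at age = -intercept/slope:
-coef(model_logistic)[1] / coef(model_logistic)[2]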
# 2. Compare the visualization above with a scatterplot of:
#    a) x = age
#    b) y = the observed proportion of individuals in cs_training that are in
#       financial distress, for each age.
observed_proportions <- cs_training %>%
  group_by(age) %>%
  summarize(prop = mean(in_financial_distress))
ggplot(data = fitted_points_logistic, aes(x = age, y = in_financial_distress)) +
  # Training data with black points:
  geom_jitter(height = 0.01) +
  # Best-fitting linear regression line in blue:
  geom_smooth(method = "lm", se = FALSE) +
  # Best-fitting logistic curve in red:
  geom_line(data = fitted_points_logistic, mapping = aes(y = fitted_prob), col = "red", size = 1) +
  labs(x = "Age", y = "In financial distress?") +
  geom_line(data = observed_proportions, aes(x = age, y = prop), col = "orange", size = 1)
# 3. Change the offset in age to 10 and -50. What do you notice happens to:
#    a) the coefficient for age in the regression table?
#    b) the shape of the logistic curve of the fitted model?
#
#    Offset 50:
#    a) Coefficient = 0.239
#    b) Shape: very S-like
#    Offset 10:
#    a) Coefficient = 0.0132
#    b) Shape: less S-like
#    Offset -50:
#    a) Coefficient = -0.467
#    b) Shape: very inverse S-like (looks like a "Z" or "2" instead of an "S")
# 4. Challenge question: Change the offset in age to 6.9. Why is the logistic
#    curve flat? At what value is it?
financial_distress %>%
  group_by(in_financial_distress) %>%
  summarize(avg_age = mean(age))
# Both groups have the same mean age, so there is no information provided by
# the variable age. The red line is at the total proportion of people in
# financial distress irrespective of group = 0.0668 = 6.68%
financial_distress %>%
  summarize(overall_prop = mean(in_financial_distress))