SDS/CSC 293 Regression Code
#------------------------------------------------------------------------------
# Lec08: 2019/02/25
#------------------------------------------------------------------------------
library(tidyverse)
library(broom)
library(stringr)

#------------------------------------------------------------------------------
# Data for today
# Read over the help file
?mtcars

# Data wrangling
mtcars <- mtcars %>%
  # Convert to tibble data frame:
  as_tibble() %>%
  # Add identification variable as first column:
  mutate(ID = 1:n()) %>%
  select(ID, everything()) %>%
  # vs & am variables were recorded as numerical 0/1, but are really categorical,
  # so convert them:
  mutate(
    vs = ifelse(vs == 0, "V-shaped", "straight"),
    am = ifelse(am == 0, "automatic", "manual")
  )

# Set up validation set framework: create training and test set at random
set.seed(76)
mtcars_train <- mtcars %>%
  sample_frac(0.75)
mtcars_test <- mtcars %>%
  anti_join(mtcars_train, by = "ID")
glimpse(mtcars_train)
glimpse(mtcars_test)
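# Quick sanity check (optional): the two sets should partition mtcars, i.e.
# every row lands in exactly one of them and no ID appears in both.
nrow(mtcars_train) + nrow(mtcars_test) == nrow(mtcars)
intersect(mtcars_train$ID, mtcars_test$ID)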
#------------------------------------------------------------------------------
# Model 1:
# y: fuel efficiency: measured in miles per gallon
# x: engine horsepower: unit of power where 1hp = work needed to lift 75kg a
#    vertical distance of 1 meter in 1 second: https://en.wikipedia.org/wiki/Horsepower

# 1. Fit model to training data
model_1_formula <- as.formula("mpg ~ hp")
model_1 <- lm(model_1_formula, data = mtcars_train)

# 2.a) Extract regression table with confidence intervals
model_1 %>%
  broom::tidy(conf.int = TRUE)

# 2.b) Extract point-by-point info of points used to fit model
fitted_points_1 <- model_1 %>%
  broom::augment()
fitted_points_1

# 2.c) Extract model summary info
model_1 %>%
  broom::glance()
# 3. Make predictions on test data. Compare this to the use of broom::augment()
#    for fitted_points_1 above:
predicted_points_1 <- model_1 %>%
  broom::augment(newdata = mtcars_test)
predicted_points_1
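# Optional comparison: see which columns broom::augment() adds in each case,
# i.e. diagnostic columns for the training fit vs. predictions for the test set.
names(fitted_points_1)
names(predicted_points_1)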
# 4. Visualize
ggplot(NULL) +
  # Training data with black points:
  geom_point(data = fitted_points_1, aes(x = hp, y = mpg)) +
  # Fitted simple linear regression model with blue line:
  geom_line(data = fitted_points_1, aes(x = hp, y = .fitted), col = "blue") +
  # Predictions for test set with red points:
  geom_point(data = predicted_points_1, aes(x = hp, y = .fitted), col = "red") +
  labs(x = "Horsepower", y = "Miles per gallon")
#------------------------------------------------------------------------------
# Model 2:
# y: fuel efficiency: measured in miles per gallon
# x: all numerical predictors

# 1. Fit model to training data
model_2_formula <- as.formula("mpg ~ cyl + disp + hp + drat + wt + qsec + gear + carb")
model_2 <- lm(model_2_formula, data = mtcars_train)
# Hint for generating the right-hand side (RHS) of formulas when you have a lot
# of predictors. Note that ID, mpg, and the categorical vs & am were then removed
# by hand to get the RHS used above.
names(mtcars) %>%
  str_c(collapse = " + ")
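# A fuller version of the hint (a sketch): drop ID, mpg, and the two categorical
# variables first, then build the formula programmatically with base R's reformulate():
rhs_vars <- setdiff(names(mtcars), c("ID", "mpg", "vs", "am"))
model_2_formula_alt <- reformulate(rhs_vars, response = "mpg")
model_2_formula_alt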
# 2.a) Extract regression table with confidence intervals
model_2 %>%
  broom::tidy(conf.int = TRUE)

# 2.b) Extract point-by-point info of points used to fit model
fitted_points_2 <- model_2 %>%
  broom::augment()
fitted_points_2

# 2.c) Extract model summary info
model_2 %>%
  broom::glance()
# 3. Make predictions on test data. Compare this to the use of broom::augment()
#    for fitted_points_2 above:
predicted_points_2 <- model_2 %>%
  broom::augment(newdata = mtcars_test)
predicted_points_2
#------------------------------------------------------------------------------
# Lec08 Exercises. Solutions at bottom
# 1. What are the test set RMSEs of Models 1 & 2? Which is higher?
# 2. What is the ratio n/p for our trained Model 2, i.e. the number of points
#    in the training set divided by the number of predictors?
# 3. Change the train/test validation ratio from 3:1 to 1:1. What are the RMSEs
#    of Models 1 & 2? Which is higher?
# 4. What is the new ratio n/p for our new trained Model 2?
# 5. How does the difference in test set RMSE between Models 1 & 2 change when
#    the train/test validation ratio goes from 3:1 to 1:1?
# 6. Try a different combination of variables and see if you can lower your
#    RMSE.
#------------------------------------------------------------------------------
# Lec10: 2019/03/04
#------------------------------------------------------------------------------
library(tidyverse)
library(broom)

# Read in training data from https://www.kaggle.com/c/GiveMeSomeCredit/
financial_distress_orig <-
  "https://rudeboybert.github.io/SDS293/static/methods/logisitic/cs-training.csv" %>%
  read_csv() %>%
  select(ID = X1, in_financial_distress = SeriousDlqin2yrs, age)

# Let's deliberately tinker and engineer this data for educational purposes
# only: for those individuals who are in financial distress, let's add an offset
# of 50 to their ages.
offset <- 50
financial_distress <- financial_distress_orig %>%
  mutate(age = ifelse(in_financial_distress == 1, age + offset, age))
# Split data into train and test so that we can fit to the training set and
# predict on the test set. Note that this corresponds to the "validation set"
# approach, used here mostly for illustrative purposes; it isn't used much in
# practice because you don't end up making predictions on every observation.
# Be sure to View() these data frames after you create them:
set.seed(76)
cs_training <- financial_distress %>%
  sample_frac(0.25)
cs_test <- financial_distress %>%
  anti_join(cs_training, by = "ID")
# EDA: Recall that we engineered the two boxplots to not overlap by adding an
# offset
ggplot(cs_training, aes(x = as.logical(in_financial_distress), y = age)) +
  geom_boxplot() +
  labs(x = "In financial distress?", y = "Age")
# Let's create a scatterplot but with age on the x-axis. Note this plot suffers
# from overplotting:
ggplot(cs_training, aes(x = age, y = in_financial_distress)) +
  geom_point() +
  labs(x = "Age", y = "In financial distress?")
# Let's "jitter" the plot a little to break up the overplotting. In other words,
# add random vertical "nudges" to the points so that we can get a sense of how
# many points are on top of each other. Note this is only a visualization tool;
# it does not alter the original values in the data frame.
# For more info on geom_jitter() read:
# https://moderndive.netlify.com/3-viz.html#overplotting
ggplot(cs_training, aes(x = age, y = in_financial_distress)) +
  geom_jitter(height = 0.01) +
  labs(x = "Age", y = "In financial distress?")
# The best-fitting linear regression line in blue is no good in this particular
# case: you end up with fitted probabilities less than 0 (and nothing stops
# them from exceeding 1 either).
ggplot(cs_training, aes(x = age, y = in_financial_distress)) +
  geom_jitter(height = 0.01) +
  labs(x = "Age", y = "In financial distress?") +
  geom_smooth(method = "lm", se = FALSE)
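# To see this concretely (a quick optional check): fit the corresponding linear
# model and look at the range of its fitted values; the lower end falls below 0,
# which is impossible for a probability.
lm_linear_prob <- lm(in_financial_distress ~ age, data = cs_training)
range(fitted(lm_linear_prob))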
# Fit a logistic regression model. Note the use of glm() instead of lm():
model_logistic <- glm(in_financial_distress ~ age, family = "binomial", data = cs_training)

# 2.a) Extract regression table with confidence intervals
# Notice coefficient for age. Is it positive or negative?
model_logistic %>%
  broom::tidy(conf.int = TRUE)

# 2.b) Extract point-by-point info of points used to fit model
fitted_points_logistic <- model_logistic %>%
  broom::augment()
fitted_points_logistic
# Note however that the .fitted values are fitted log-odds, NOT fitted
# probabilities. We convert to fitted probabilities using the inverse-logit function:
fitted_points_logistic <- fitted_points_logistic %>%
  mutate(fitted_prob = 1/(1 + exp(-.fitted)))
fitted_points_logistic
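# Equivalent shortcuts (a sketch): base R's plogis() is the inverse-logit, and
# broom::augment() can return probabilities directly via type.predict = "response":
fitted_points_logistic %>%
  mutate(fitted_prob_check = plogis(.fitted))
model_logistic %>%
  broom::augment(type.predict = "response")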
# 2.c) Extract model summary info
model_logistic %>%
  broom::glance()

# 3. Make predictions on test data. Compare this to the use of broom::augment()
#    for fitted_points_logistic above:
predicted_points_logistic <- model_logistic %>%
  broom::augment(newdata = cs_test)
predicted_points_logistic
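# As with the training data, convert the predicted log-odds for the test set
# into predicted probabilities:
predicted_points_logistic <- predicted_points_logistic %>%
  mutate(fitted_prob = 1/(1 + exp(-.fitted)))
predicted_points_logistic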
# 4. Visualize the fitted model using only the training data for now:
ggplot(data = fitted_points_logistic, aes(x = age, y = in_financial_distress)) +
  # Training data with black points:
  geom_jitter(height = 0.01) +
  # Best-fitting linear regression line in blue:
  geom_smooth(method = "lm", se = FALSE) +
  # Best-fitting logistic curve in red:
  geom_line(data = fitted_points_logistic, mapping = aes(y = fitted_prob), col = "red", size = 1) +
  labs(x = "Age", y = "In financial distress?")
#------------------------------------------------------------------------------
# Lec10 Exercises. Solutions at bottom
# 1. Using the visualization above, for what age would you say that there is a
#    50% probability that an individual is in financial distress?
# 2. Compare the visualization above with a scatterplot of:
#    a) x = age
#    b) y = the observed proportion of individuals in cs_training that are in
#       financial distress, for each age.
# 3. Change the offset in age to 10 and -50. What do you notice happens to:
#    a) the coefficient for age in the regression table?
#    b) the shape of the logistic curve of the fitted model?
# 4. Challenge question: Change the offset in age to 6.9. Why is the logistic
#    curve flat? At what value is it?
#------------------------------------------------------------------------------
# Lec08 Solutions
# 1. What are the test set RMSEs of Models 1 & 2? Which is higher?
#    Being sure to set.seed(76):
predicted_points_1 %>%
  yardstick::rmse(truth = mpg, estimate = .fitted)
predicted_points_2 %>%
  yardstick::rmse(truth = mpg, estimate = .fitted)
# Model 1 = 5.38 < 9.05 = Model 2
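# The same RMSEs computed "by hand", as a check on the yardstick output
# (RMSE = square root of the mean squared prediction error):
predicted_points_1 %>%
  summarize(rmse = sqrt(mean((mpg - .fitted)^2)))
predicted_points_2 %>%
  summarize(rmse = sqrt(mean((mpg - .fitted)^2)))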
# 2. What is the ratio n/p for our trained Model 2, i.e. the number of points
#    in the training set divided by the number of predictors?
#    n = 24, p = 8, thus n/p = 3
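# Verifying n and p in code: n is the number of training rows, p the number of
# predictor terms on the RHS of the formula (all.vars() also counts the response
# mpg, hence the minus 1):
n <- nrow(mtcars_train)
p <- length(all.vars(model_2_formula)) - 1
n / p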
# 3. Change the train/test validation ratio from 3:1 to 1:1. What are the RMSEs
#    of Models 1 & 2? Which is higher?
#    Being sure to set.seed(76):
#    Model 1 = 5.05 < 7.89 = Model 2
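# One way to reproduce this (a sketch; exact RMSEs depend on the random split,
# and the _50 object names are just illustrative):
set.seed(76)
mtcars_train_50 <- mtcars %>% sample_frac(0.5)
mtcars_test_50 <- mtcars %>% anti_join(mtcars_train_50, by = "ID")
lm(model_1_formula, data = mtcars_train_50) %>%
  broom::augment(newdata = mtcars_test_50) %>%
  yardstick::rmse(truth = mpg, estimate = .fitted)
lm(model_2_formula, data = mtcars_train_50) %>%
  broom::augment(newdata = mtcars_test_50) %>%
  yardstick::rmse(truth = mpg, estimate = .fitted)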
# 4. What is the new ratio n/p for our new trained Model 2?
#    n = 16, p = 8, thus n/p = 2
# 5. How does the difference in test set RMSE between Models 1 & 2 change when
#    the train/test validation ratio goes from 3:1 to 1:1?
#    3:1 split: 5.38 vs 9.05
#    1:1 split: 5.05 vs 7.89
#    The gap happens to narrow here, but with so few test observations these
#    RMSE estimates are noisy; in general, overfitting becomes a bigger concern
#    as the ratio n/p gets smaller.
# 6. Try a different combination of variables and see if you can lower your
#    RMSE.
#------------------------------------------------------------------------------
# Lec10 Solutions
# 1. Using the visualization above, for what age would you say that there is a
#    50% probability that an individual is in financial distress?
#    Maybe around 87?
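# For an exact answer (a sketch): the fitted probability is 50% where the fitted
# log-odds equal 0, i.e. at age = -intercept/slope:
-coef(model_logistic)[1] / coef(model_logistic)[2]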
# 2. Compare the visualization above with a scatterplot of:
#    a) x = age
#    b) y = the observed proportion of individuals in cs_training that are in
#       financial distress, for each age.
observed_proportions <- cs_training %>%
  group_by(age) %>%
  summarize(prop = mean(in_financial_distress))
ggplot(data = fitted_points_logistic, aes(x = age, y = in_financial_distress)) +
  # Training data with black points:
  geom_jitter(height = 0.01) +
  # Best-fitting linear regression line in blue:
  geom_smooth(method = "lm", se = FALSE) +
  # Best-fitting logistic curve in red:
  geom_line(data = fitted_points_logistic, mapping = aes(y = fitted_prob), col = "red", size = 1) +
  labs(x = "Age", y = "In financial distress?") +
  geom_line(data = observed_proportions, aes(x = age, y = prop), col = "orange", size = 1)
# 3. Change the offset in age to 10 and -50. What do you notice happens to:
#    a) the coefficient for age in the regression table?
#    b) the shape of the logistic curve of the fitted model?
#
#    Offset 50:
#    a) Coefficient = 0.239
#    b) Shape: very S-like
#    Offset 10:
#    a) Coefficient = 0.0132
#    b) Shape: less S-like
#    Offset -50:
#    a) Coefficient = -0.467
#    b) Shape: very inverse S-like (looks like a "Z" or "2" instead of an "S")
# 4. Challenge question: Change the offset in age to 6.9. Why is the logistic
#    curve flat? At what value is it?
financial_distress %>%
  group_by(in_financial_distress) %>%
  summarize(avg_age = mean(age))
# Both groups have the same mean age, so there is no information provided by
# the variable age. The red line is at the total proportion of people in
# financial distress irrespective of group = 0.0668 = 6.68%
financial_distress %>%
  summarize(overall_prop = mean(in_financial_distress))