@tonyfischetti · Last active April 27, 2021
Code for my blog post "Kickin' it with elastic net regression"
#!/usr/bin/Rscript --vanilla
###########################################################
##                                                       ##
##  elastic.R                                            ##
##                                                       ##
##  Author: Tony Fischetti                               ##
##          [email protected]                              ##
##                                                       ##
###########################################################
# workspace cleanup
rm(list=ls())
# options
options(echo=TRUE)
options(stringsAsFactors=FALSE)
# cli args
args <- commandArgs(trailingOnly=TRUE)
# libraries
library(dplyr)
library(magrittr)
library(assertr)
library(tidyr)
library(ggplot2)
library(car)
library(glmnet)
library(boot)
library(gridExtra)
# mtcars: 32 observations x 10 predictors
# mean VIF: 9.577414 (substantial multicollinearity)
# design matrix (intercept column dropped) and response
X <- model.matrix(mpg ~ ., data=mtcars)[,-1]
y <- mtcars$mpg
# grid of 501 alpha values from 0 (pure ridge) to 1 (pure lasso)
alphas <- seq(0, 1, by=.002)
mses <- numeric(501)
mins <- numeric(501)
maxes <- numeric(501)
for(i in 1:501){
    cvfits <- cv.glmnet(X, y, alpha=alphas[i], nfolds=32)
    loc <- which(cvfits$lambda == cvfits$lambda.min)
    maxes[i] <- cvfits$lambda %>% max
    mins[i] <- cvfits$lambda %>% min
    mses[i] <- cvfits$cvm[loc]
}
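# A hedged aside (not in the original post): with nfolds equal to
# nrow(X), cv.glmnet() is performing LOOCV, which is deterministic.
# For k < n folds, the glmnet vignette recommends fixing the fold
# assignments via 'foldid' so that cvm differences across alphas
# reflect alpha rather than fold-to-fold noise, e.g.:
# foldid <- sample(rep(1:10, length.out=nrow(X)))
# cv.glmnet(X, y, alpha=0.5, foldid=foldid)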
this <- data.frame(mse=mses, alpha=alphas)
plot1 <- ggplot(this, aes(x=alpha, y=mse)) +
    geom_point(shape=1) +
    #geom_smooth() +
    ylab("LOOCV mean squared error") +
    xlab("alpha parameter") +
    ggtitle("model error of highest performing regularized elastic-net
             regression as a function of alpha parameter
             <using mtcars predicting mpg>")
# kitchen.sink <- glm(mpg ~ ., data=mtcars)
# cv.glm(mtcars, kitchen.sink) # 12.02194
errors <- numeric(nrow(mtcars))
for(i in 1:nrow(mtcars)){
    train <- mtcars[-i,]
    test  <- mtcars[i,]
    kitchen.sink <- glm(mpg ~ ., data=train)
    the.pred <- predict(kitchen.sink, newdata=test)
    errors[i] <- (the.pred - test$mpg)^2
}
mean(errors) # 12.18156
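# hedged cross-check: boot::cv.glm (see the commented-out call above)
# estimates the same LOOCV error in one call; its $delta vector holds
# the raw and bias-adjusted estimates:
# cv.glm(mtcars, glm(mpg ~ ., data=mtcars))$delta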
full <- glm(mpg ~ ., data=mtcars)
stepped <- step(full, direction = "both")
sub.coeffs <- names(stepped$coefficients[-1])
errors <- numeric(nrow(mtcars))
for(i in 1:nrow(mtcars)){
    train <- mtcars[-i,]
    test  <- mtcars[i,]
    new.feat.sub <- train[, sub.coeffs]
    new.feat.sub$mpg <- train$mpg
    mod <- glm(mpg ~ ., data=new.feat.sub)
    the.pred <- predict(mod, newdata=test)
    errors[i] <- (the.pred - test$mpg)^2
}
mean(errors) # 7.228
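# (7.228 is optimistically biased: step() chose the feature subset
#  using *all* the data, so every held-out row already influenced the
#  selection -- hence the "bad stepwise est." label in the plot below)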
# substantial variation in selected features
errors <- numeric(nrow(mtcars))
for(i in 1:nrow(mtcars)){
    train <- mtcars[-i,]
    test  <- mtcars[i,]
    smaller <- glm(mpg ~ ., data=train)
    stepped <- step(smaller, direction="both", trace=0)
    new.feat.sub <- train[, names(stepped$coefficients[-1])]
    print(names(stepped$coefficients[-1]))
    new.feat.sub$mpg <- train$mpg
    mod <- glm(mpg ~ ., data=new.feat.sub)
    the.pred <- predict(mod, newdata=test)
    errors[i] <- (the.pred - test$mpg)^2
}
mean(errors) # 13.27429
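# A hedged extension (not in the original post): tabulating selection
# frequency across the LOOCV folds makes the "substantial variation"
# observation concrete, instead of eyeballing the print() output:
# selected <- unlist(lapply(1:nrow(mtcars), function(i){
#     s <- step(glm(mpg ~ ., data=mtcars[-i,]), direction="both", trace=0)
#     names(s$coefficients[-1])
# }))
# sort(table(selected), decreasing=TRUE)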
other.errors <- data.frame(method=c("kitchen sink",
                                    "bad stepwise est.",
                                    "better stepwise est."),
                           errors=c(12.18156, 7.228, 13.27429))
plot2 <- ggplot(this, aes(x=alpha, y=mse)) +
    geom_point(shape=1) +
    #geom_smooth() +
    ylab("LOOCV mean squared error") +
    xlab("alpha parameter") +
    ggtitle("[with kitchen sink and stepwise LOOCV MSEs]") +
    geom_hline(aes(yintercept=errors,
                   color=method, group=method),
               # show_guide was deprecated; show.legend is its modern spelling
               size=2, data=other.errors, show.legend=TRUE)
grid.arrange(plot1, plot2, ncol=2)
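# (a hedged one-liner, not in the original post, to read the winning
#  alpha off the grid search above: alphas[which.min(mses)]; min(mses))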
#--------------------------------------#
#--------------------------------------#
#--------------------------------------#
# UCI forest fires dataset: 517 observations x 11 predictors
# mean VIF: 1.727451 (little multicollinearity)
fires <- read.csv("./forestfires.csv")
fires$month <- NULL
# log-transform the heavily right-skewed burned area
# (the "+1" handles the many zero-area rows)
fires$area <- log(fires$area + 1)
X <- model.matrix(area ~ ., data=fires)[,-1]
y <- fires$area
alphas <- seq(0.0, 1, by=.01)
mses <- numeric(101)
mins <- numeric(101)
maxes <- numeric(101)
for(i in 1:101){
    cvfits <- cv.glmnet(X, y, alpha=alphas[i], nfolds=400)
    loc <- which(cvfits$lambda == cvfits$lambda.min)
    maxes[i] <- cvfits$lambda %>% max
    mins[i] <- cvfits$lambda %>% min
    mses[i] <- cvfits$cvm[loc]
}
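# hedged speed note: 101 alphas x 400 folds is slow; cv.glmnet()
# accepts parallel=TRUE when a parallel backend is registered, e.g.:
# library(doMC); registerDoMC(cores=4)
# cvfits <- cv.glmnet(X, y, alpha=alphas[i], nfolds=400, parallel=TRUE)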
this <- data.frame(mse=mses, alpha=alphas)
plot1 <- ggplot(this, aes(x=alpha, y=mse)) +
    geom_point(shape=1) +
    geom_smooth() +
    ylab("400-fold cross-validation mean squared error") +
    xlab("alpha parameter") +
    ggtitle("model error of highest performing regularized
             elastic-net regression as a function of alpha parameter
             <using forest fire data>")
# kitchen.sink <- glm(area ~ ., data=fires)
# cv.glm(fires, kitchen.sink)$delta[2] # 2.131694
#
# step(kitchen.sink, direction="both") # 4 variables
# stepped <- glm(area ~ X + DMC + RH + wind, data=fires)
# cv.glm(fires, stepped) # 1.951653
#
errors <- numeric(nrow(fires))
for(i in 1:nrow(fires)){
    train <- fires[-i,]
    test  <- fires[i,]
    kitchen.sink <- glm(area ~ ., data=train)
    the.pred <- predict(kitchen.sink, newdata=test)
    errors[i] <- (the.pred - test$area)^2
}
mean(errors) # 2.132028
full <- glm(area ~ ., data=fires)
stepped <- step(full, direction = "both")
sub.coeffs <- names(stepped$coefficients[-1])
errors <- numeric(nrow(fires))
for(i in 1:nrow(fires)){
    train <- fires[-i,]
    test  <- fires[i,]
    new.feat.sub <- train[, sub.coeffs]
    new.feat.sub$area <- train$area
    mod <- glm(area ~ ., data=new.feat.sub)
    the.pred <- predict(mod, newdata=test)
    errors[i] <- (the.pred - test$area)^2
}
mean(errors) # 1.951689
# much less variation in selected features
errors <- numeric(nrow(fires))
for(i in 1:nrow(fires)){
    train <- fires[-i,]
    test  <- fires[i,]
    smaller <- glm(area ~ ., data=train)
    stepped <- step(smaller, direction="both", trace=0)
    new.feat.sub <- train[, names(stepped$coefficients[-1])]
    print(names(stepped$coefficients[-1]))
    new.feat.sub$area <- train$area
    mod <- glm(area ~ ., data=new.feat.sub)
    the.pred <- predict(mod, newdata=test)
    errors[i] <- (the.pred - test$area)^2
}
mean(errors) # 2.151955
other.errors <- data.frame(method=c("kitchen sink",
                                    "bad stepwise est.",
                                    "better stepwise est."),
                           errors=c(2.132028, 1.951689, 2.151955))
plot2 <- ggplot(this, aes(x=alpha, y=mse)) +
    geom_point(shape=1) +
    geom_smooth() +
    ylab("400-fold cross-validation mean squared error") +
    xlab("alpha parameter") +
    ggtitle("[with kitchen sink and stepwise LOOCV MSEs]") +
    geom_hline(aes(yintercept=errors,
                   color=method, group=method),
               size=2, data=other.errors, show.legend=TRUE)
grid.arrange(plot1, plot2, ncol=2)
#--------------------------------------#
#--------------------------------------#
#--------------------------------------#
# glmnet's built-in 'QuickStartExample': 100 observations x 20 predictors
# mean VIF: 1.225789 (essentially no multicollinearity)
# note: older glmnet versions put `x` and `y` directly in the workspace;
# newer versions load a list, in which case use QuickStartExample$x
# and QuickStartExample$y instead
data(QuickStartExample)
X <- x
mock.frame <- data.frame(X)
mock.frame$target <- y
alphas <- seq(0, 1, by=.002)
mses <- numeric(501)
mins <- numeric(501)
maxes <- numeric(501)
for(i in 1:501){
    cvfits <- cv.glmnet(X, y, alpha=alphas[i], nfolds=100)
    loc <- which(cvfits$lambda == cvfits$lambda.min)
    maxes[i] <- cvfits$lambda %>% max
    mins[i] <- cvfits$lambda %>% min
    mses[i] <- cvfits$cvm[loc]
}
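# A hedged follow-up (not in the original post): after picking the
# best alpha, one might refit and inspect which coefficients survive
# at lambda.min:
# best.alpha <- alphas[which.min(mses)]
# final <- cv.glmnet(X, y, alpha=best.alpha, nfolds=100)
# coef(final, s="lambda.min")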
this <- data.frame(mse=mses, alpha=alphas)
plot1 <- ggplot(this, aes(x=alpha, y=mse)) +
    geom_point(shape=1) +
    #geom_smooth() +
    ylab("LOOCV mean squared error") +
    xlab("alpha parameter") +
    ggtitle("model error of highest performing regularized
             elastic-net regression as a function of alpha parameter
             <'quickstart' glmnet data>")
# kitchen.sink <- glm(target ~ ., data=mock.frame)
# cv.glm(mock.frame, kitchen.sink)$delta[2] # 1.142114
#
# step(kitchen.sink, direction="both") # 8 variables
# stepped <- glm(target ~ X1 + X3 + X5 + X6 + X8 + X11 + X14 + X20, data=mock.frame)
# cv.glm(mock.frame, stepped) # 0.9433261
errors <- numeric(nrow(mock.frame))
for(i in 1:nrow(mock.frame)){
    train <- mock.frame[-i,]
    test  <- mock.frame[i,]
    kitchen.sink <- glm(target ~ ., data=train)
    the.pred <- predict(kitchen.sink, newdata=test)
    errors[i] <- (the.pred - test$target)^2
}
mean(errors) # 1.144384
full <- glm(target ~ ., data=mock.frame)
stepped <- step(full, direction = "both")
sub.coeffs <- names(stepped$coefficients[-1])
errors <- numeric(nrow(mock.frame))
for(i in 1:nrow(mock.frame)){
    train <- mock.frame[-i,]
    test  <- mock.frame[i,]
    new.feat.sub <- train[, sub.coeffs]
    new.feat.sub$target <- train$target
    mod <- glm(target ~ ., data=new.feat.sub)
    the.pred <- predict(mod, newdata=test)
    errors[i] <- (the.pred - test$target)^2
}
mean(errors) # 0.9441541
# virtually no variation in selected features
errors <- numeric(nrow(mock.frame))
for(i in 1:nrow(mock.frame)){
    train <- mock.frame[-i,]
    test  <- mock.frame[i,]
    smaller <- glm(target ~ ., data=train)
    stepped <- step(smaller, direction="both", trace=0)
    new.feat.sub <- train[, names(stepped$coefficients[-1])]
    print(names(stepped$coefficients[-1]))
    new.feat.sub$target <- train$target
    mod <- glm(target ~ ., data=new.feat.sub)
    the.pred <- predict(mod, newdata=test)
    errors[i] <- (the.pred - test$target)^2
}
mean(errors) # 0.9544712
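# (note that the "bad" and "better" stepwise estimates nearly coincide
#  here, consistent with the comment above: when the selected feature
#  set is stable across folds, selecting on the full data costs little)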
other.errors <- data.frame(method=c("kitchen sink",
                                    "bad stepwise est.",
                                    "better stepwise est."),
                           errors=c(1.144384, 0.9441541, 0.9544712))
plot2 <- ggplot(this, aes(x=alpha, y=mse)) +
    geom_point(shape=1) +
    #geom_smooth() +
    ylab("LOOCV mean squared error") +
    xlab("alpha parameter") +
    ggtitle("[with kitchen sink and stepwise LOOCV MSEs]") +
    geom_hline(aes(yintercept=errors,
                   color=method, group=method),
               size=2, data=other.errors, show.legend=TRUE)
grid.arrange(plot1, plot2, ncol=2)
@sahirbhatnagar commented:
You might be interested in the c060 package, which uses an algorithm to do what you're doing.
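For reference, a hedged sketch of that approach, assuming the epsgo()
interface described in c060's documentation; the function and parameter
names below are untested assumptions, not verified code:

    library(c060)
    # search alpha over [0, 1] with the EPSGO optimizer instead of a grid
    bounds <- t(data.frame(alpha=c(0, 1)))
    colnames(bounds) <- c("lower", "upper")
    fit <- epsgo(Q.func="tune.glmnet.interval", bounds=bounds,
                 parms.coding="none", seed=1234,
                 x=X, y=y, family="gaussian",
                 type.min="lambda.min", type.measure="mse")
    summary(fit)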
