tonyfischetti · April 27, 2021 13:24 · sahirbhatnagar · Aug 20, 2015
diff --git a/elastic-net-regression-evaluation.R b/elastic-net-regression-evaluation.R
 #!/usr/bin/Rscript --vanilla

 ###########################################################
 ##                                                       ##
 ##   elastic.R                                           ##
 ##                                                       ##
 ##                Author: Tony Fischetti                 ##
 ##                        [email protected]       ##
 ##                                                       ##
 ###########################################################

 # workspace cleanup
 rm(list=ls())

 # options
 options(echo=TRUE)
 options(stringsAsFactors=FALSE)

 # cli args
 args <- commandArgs(trailingOnly=TRUE)

 # libraries
 library(dplyr)
 library(magrittr)
 library(assertr)
 library(tidyr)
 library(ggplot2)
 library(car)

 library(glmnet)
 library(boot)

 library(gridExtra)


 # 32x10
 # mean vif: 9.577414
 X <- model.matrix(mpg ~ ., data=mtcars)[,-1]
 y <- mtcars$mpg

 alphas <- seq(0, 1, by=.002)
 mses <- numeric(501)
 mins <- numeric(501)
 maxes <- numeric(501)

 for(i in 1:501){
  cvfits <- cv.glmnet(X, y, alpha=alphas[i], nfolds=32)
  loc <- which(cvfits$lambda==cvfits$lambda.min)
  maxes[i] <- cvfits$lambda %>% max
  mins[i] <- cvfits$lambda %>% min
  mses[i] <- cvfits$cvm[loc]
 }

 this <- data.frame(mse=mses, alpha=alphas)

 plot1 <- ggplot(this, aes(x=alpha, y=mse)) +
  geom_point(shape=1) +
  #geom_smooth() +
  ylab("LOOCV mean squared error") +
  xlab("alpha parameter") +
  ggtitle("model error of highest performing regularized elastic-net
          regression as a function of alpha parameter
          <using mtcars predicting mpg>")


 # kitchen.sink <- glm(mpg ~ ., data=mtcars)
 # cv.glm(mtcars, kitchen.sink)  # 12.02194

 errors <- numeric(nrow(mtcars))
 for(i in 1:nrow(mtcars)){
  train <- mtcars[-i,]
  test <- mtcars[i,]
  kitchen.sink <- glm(mpg ~ ., data=train)
  the.pred <- predict(kitchen.sink, newdata=test)
  errors[i] <- (the.pred - test$mpg)^2
 }
 mean(errors)     # 12.18156



 full <- glm(mpg ~ ., data=mtcars)
 stepped <- step(full, direction = "both")
 sub.coeffs <- names(stepped$coefficients[-1])

 errors <- numeric(nrow(mtcars))
 for(i in 1:nrow(mtcars)){
  train <- mtcars[-i,]
  test <- mtcars[i,]
  new.feat.sub <- train[, sub.coeffs]
  new.feat.sub$mpg <- train$mpg
  mod <- glm(mpg ~ ., data=new.feat.sub)
  the.pred <- predict(mod, newdata=test)
  errors[i] <- (the.pred - test$mpg)^2
 }
 mean(errors)     # 7.228

 # substantial variation in selected features
 errors <- numeric(nrow(mtcars))
 for(i in 1:nrow(mtcars)){
  train <- mtcars[-i,]
  test <- mtcars[i,]
  smaller <- glm(mpg ~ ., data=train)
  stepped <- step(smaller, direction="both", trace=0)
  new.feat.sub <- train[, names(stepped$coefficients[-1])]
  print(names(stepped$coefficients[-1]))
  new.feat.sub$mpg <- train$mpg
  mod <- glm(mpg ~ ., data=new.feat.sub)
  the.pred <- predict(mod, newdata=test)
  errors[i] <- (the.pred - test$mpg)^2
 }
 mean(errors)    # 13.27429


 other.errors <- data.frame(method=c("kitchen sink",
                                    "bad stepwise est.",
                                    "better stepwise est."),
                           errors=c(12.18156, 7.228, 13.27429))

 plot2 <- ggplot(this, aes(x=alpha, y=mse)) +
  geom_point(shape=1) +
  #geom_smooth() +
  ylab("LOOCV mean squared error") +
  xlab("alpha parameter") +
  ggtitle("[with kitchen sink and stepwise LOOCV MSEs]") +
  geom_hline(aes(yintercept=errors,
                 color=method, group=method),
             size=2, data=other.errors, show_guide=TRUE)


 grid.arrange(plot1, plot2, ncol=2)


 #--------------------------------------#
 #--------------------------------------#
 #--------------------------------------#

 # 517 x 11
 # mean VIF: 1.727451
 fires <- read.csv("./forestfires.csv")
 fires$month <- NULL
 fires$area <- log(fires$area+1)



 X <- model.matrix(area ~ ., data=fires)[,-1]
 y <- fires$area

 alphas <- seq(0.0, 1, by=.01)
 mses <- numeric(101)
 mins <- numeric(101)
 maxes <- numeric(101)



 for(i in 1:101){
  cvfits <- cv.glmnet(X, y, alpha=alphas[i], nfolds=400)
  loc <- which(cvfits$lambda==cvfits$lambda.min)
  maxes[i] <- cvfits$lambda %>% max
  mins[i] <- cvfits$lambda %>% min
  mses[i] <- cvfits$cvm[loc]
 }

 this <- data.frame(mse=mses, alpha=alphas)

 plot1 <- ggplot(this, aes(x=alpha, y=mse)) +
  geom_point(shape=1) +
  geom_smooth() +
  ylab("400-fold cross validation mean squared error") +
  xlab("alpha parameter") +
  ggtitle("model error of highest performing regularized
          elastic-net regression as a function of alpha parameter
          <using forest fire data>")



 # kitchen.sink <- glm(area ~ ., data=fires)
 # cv.glm(fires, kitchen.sink)$delta[2]   # 2.131694
 # 
 # step(kitchen.sink, direction="both") # 4 variables
 # stepped <- glm(area ~ X + DMC + RH + wind, data=fires)
 # cv.glm(fires, stepped)  # 1.951653
 # 



 errors <- numeric(nrow(fires))
 for(i in 1:nrow(fires)){
  train <- fires[-i,]
  test <- fires[i,]
  kitchen.sink <- glm(area ~ ., data=train)
  the.pred <- predict(kitchen.sink, newdata=test)
  errors[i] <- (the.pred - test$area)^2
 }
 mean(errors)     # 2.132028


 full <- glm(area ~ ., data=fires)
 stepped <- step(full, direction = "both")
 sub.coeffs <- names(stepped$coefficients[-1])

 errors <- numeric(nrow(fires))
 for(i in 1:nrow(fires)){
  train <- fires[-i,]
  test <- fires[i,]
  new.feat.sub <- train[, sub.coeffs]
  new.feat.sub$area <- train$area
  mod <- glm(area ~ ., data=new.feat.sub)
  the.pred <- predict(mod, newdata=test)
  errors[i] <- (the.pred - test$area)^2
 }
 mean(errors)     # 1.951689


 # much less variation in selected features
 errors <- numeric(nrow(fires))
 for(i in 1:nrow(fires)){
  train <- fires[-i,]
  test <- fires[i,]
  smaller <- glm(area ~ ., data=train)
  stepped <- step(smaller, direction="both", trace=0)
  new.feat.sub <- train[, names(stepped$coefficients[-1])]
  print(names(stepped$coefficients[-1]))
  new.feat.sub$area <- train$area
  mod <- glm(area ~ ., data=new.feat.sub)
  the.pred <- predict(mod, newdata=test)
  errors[i] <- (the.pred - test$area)^2
 }
 mean(errors)    # 2.151955


 other.errors <- data.frame(method=c("kitchen sink",
                                    "bad stepwise est.",
                                    "better stepwise est."),
                           errors=c(2.132028, 1.951689, 2.151955))

 plot2 <- ggplot(this, aes(x=alpha, y=mse)) +
  geom_point(shape=1) +
  geom_smooth() +
  ylab("400-fold cross validation mean squared error") +
  xlab("alpha parameter") +
  ggtitle("[with kitchen sink and stepwise LOOCV MSEs]") +
  geom_hline(aes(yintercept=errors,
                 color=method, group=method),
             size=2, data=other.errors, show_guide=TRUE)

 plot2 <- plot2 + ylab("400-fold cross validation mean squared error")


 grid.arrange(plot1, plot2, ncol=2)

 #--------------------------------------#
 #--------------------------------------#
 #--------------------------------------#

 # 100x20
 # mean vif: 1.225789
 data(QuickStartExample)
 X <- x

 mock.frame <- data.frame(X)
 mock.frame$target <- y


 alphas <- seq(0, 1, by=.002)
 mses <- numeric(501)
 mins <- numeric(501)
 maxes <- numeric(501)


 for(i in 1:501){
  cvfits <- cv.glmnet(X, y, alpha=alphas[i], nfolds=100)
  loc <- which(cvfits$lambda==cvfits$lambda.min)
  maxes[i] <- cvfits$lambda %>% max
  mins[i] <- cvfits$lambda %>% min
  mses[i] <- cvfits$cvm[loc]
 }

 this <- data.frame(mse=mses, alpha=alphas)

 plot1 <- ggplot(this, aes(x=alpha, y=mse)) +
  geom_point(shape=1) +
  #geom_smooth() +
  ylab("LOOCV mean squared error") +
  xlab("alpha parameter") +
  ggtitle("model error of highest performing regularized
          elastic-net regression as a function of alpha parameter
          <'quickstart' glmnet data>")



 # kitchen.sink <- glm(target ~ ., data=mock.frame)
 # cv.glm(mock.frame, kitchen.sink)$delta[2]   # 1.142114
 # 
 # step(kitchen.sink, direction="both") # 8 variables
 # stepped <- glm(target ~ X1 + X3 + X5 + X6 + X8 + X11 + X14 + X20, data=mock.frame)
 # cv.glm(mock.frame, stepped)  # 0.9433261


 errors <- numeric(nrow(mock.frame))
 for(i in 1:nrow(mock.frame)){
  train <- mock.frame[-i,]
  test <- mock.frame[i,]
  kitchen.sink <- glm(target ~ ., data=train)
  the.pred <- predict(kitchen.sink, newdata=test)
  errors[i] <- (the.pred - test$target)^2
 }
 mean(errors)     # 1.144384


 full <- glm(target ~ ., data=mock.frame)
 stepped <- step(full, direction = "both")
 sub.coeffs <- names(stepped$coefficients[-1])

 errors <- numeric(nrow(mock.frame))
 for(i in 1:nrow(mock.frame)){
  train <- mock.frame[-i,]
  test <- mock.frame[i,]
  new.feat.sub <- train[, sub.coeffs]
  new.feat.sub$target <- train$target
  mod <- glm(target ~ ., data=new.feat.sub)
  the.pred <- predict(mod, newdata=test)
  errors[i] <- (the.pred - test$target)^2
 }
 mean(errors)     # 0.9441541


 # virtually no variation in selected features
 errors <- numeric(nrow(mock.frame))
 for(i in 1:nrow(mock.frame)){
  train <- mock.frame[-i,]
  test <- mock.frame[i,]
  smaller <- glm(target ~ ., data=train)
  stepped <- step(smaller, direction="both", trace=0)
  new.feat.sub <- train[, names(stepped$coefficients[-1])]
  print(names(stepped$coefficients[-1]))
  new.feat.sub$target <- train$target
  mod <- glm(target ~ ., data=new.feat.sub)
  the.pred <- predict(mod, newdata=test)
  errors[i] <- (the.pred - test$target)^2
 }
 mean(errors)    # 0.9544712



 other.errors <- data.frame(method=c("kitchen sink",
                                    "bad stepwise est.",
                                    "better stepwise est."),
                           errors=c(1.144384, 0.9441541, 0.9544712))

 plot2 <- ggplot(this, aes(x=alpha, y=mse)) +
  geom_point(shape=1) +
  #geom_smooth() +
  ylab("LOOCV mean squared error") +
  xlab("alpha parameter") +
  ggtitle("[with kitchen sink and stepwise LOOCV MSEs]") +
  geom_hline(aes(yintercept=errors,
                 color=method, group=method),
             size=2, data=other.errors, show_guide=TRUE)


 grid.arrange(plot1, plot2, ncol=2)
	#!/usr/bin/Rscript --vanilla

	###########################################################
	## ##
	## elastic.R ##
	## ##
	## Author: Tony Fischetti ##
	## [email protected] ##
	## ##
	###########################################################

	# workspace cleanup
	rm(list=ls())

	# options
	options(echo=TRUE)
	options(stringsAsFactors=FALSE)

	# cli args
	args <- commandArgs(trailingOnly=TRUE)

	# libraries
	library(dplyr)
	library(magrittr)
	library(assertr)
	library(tidyr)
	library(ggplot2)
	library(car)

	library(glmnet)
	library(boot)

	library(gridExtra)


	# 32x10
	# mean vif: 9.577414
	X <- model.matrix(mpg ~ ., data=mtcars)[,-1]
	y <- mtcars$mpg

	alphas <- seq(0, 1, by=.002)
	mses <- numeric(501)
	mins <- numeric(501)
	maxes <- numeric(501)

	for(i in 1:501){
	cvfits <- cv.glmnet(X, y, alpha=alphas[i], nfolds=32)
	loc <- which(cvfits$lambda==cvfits$lambda.min)
	maxes[i] <- cvfits$lambda %>% max
	mins[i] <- cvfits$lambda %>% min
	mses[i] <- cvfits$cvm[loc]
	}

	this <- data.frame(mse=mses, alpha=alphas)

	plot1 <- ggplot(this, aes(x=alpha, y=mse)) +
	geom_point(shape=1) +
	#geom_smooth() +
	ylab("LOOCV mean squared error") +
	xlab("alpha parameter") +
	ggtitle("model error of highest performing regularized elastic-net
	regression as a function of alpha parameter
	<using mtcars predicting mpg>")


	# kitchen.sink <- glm(mpg ~ ., data=mtcars)
	# cv.glm(mtcars, kitchen.sink) # 12.02194

	errors <- numeric(nrow(mtcars))
	for(i in 1:nrow(mtcars)){
	train <- mtcars[-i,]
	test <- mtcars[i,]
	kitchen.sink <- glm(mpg ~ ., data=train)
	the.pred <- predict(kitchen.sink, newdata=test)
	errors[i] <- (the.pred - test$mpg)^2
	}
	mean(errors) # 12.18156



	full <- glm(mpg ~ ., data=mtcars)
	stepped <- step(full, direction = "both")
	sub.coeffs <- names(stepped$coefficients[-1])

	errors <- numeric(nrow(mtcars))
	for(i in 1:nrow(mtcars)){
	train <- mtcars[-i,]
	test <- mtcars[i,]
	new.feat.sub <- train[, sub.coeffs]
	new.feat.sub$mpg <- train$mpg
	mod <- glm(mpg ~ ., data=new.feat.sub)
	the.pred <- predict(mod, newdata=test)
	errors[i] <- (the.pred - test$mpg)^2
	}
	mean(errors) # 7.228

	# substantial variation in selected features
	errors <- numeric(nrow(mtcars))
	for(i in 1:nrow(mtcars)){
	train <- mtcars[-i,]
	test <- mtcars[i,]
	smaller <- glm(mpg ~ ., data=train)
	stepped <- step(smaller, direction="both", trace=0)
	new.feat.sub <- train[, names(stepped$coefficients[-1])]
	print(names(stepped$coefficients[-1]))
	new.feat.sub$mpg <- train$mpg
	mod <- glm(mpg ~ ., data=new.feat.sub)
	the.pred <- predict(mod, newdata=test)
	errors[i] <- (the.pred - test$mpg)^2
	}
	mean(errors) # 13.27429


	other.errors <- data.frame(method=c("kitchen sink",
	"bad stepwise est.",
	"better stepwise est."),
	errors=c(12.18156, 7.228, 13.27429))

	plot2 <- ggplot(this, aes(x=alpha, y=mse)) +
	geom_point(shape=1) +
	#geom_smooth() +
	ylab("LOOCV mean squared error") +
	xlab("alpha parameter") +
	ggtitle("[with kitchen sink and stepwise LOOCV MSEs]") +
	geom_hline(aes(yintercept=errors,
	color=method, group=method),
	size=2, data=other.errors, show_guide=TRUE)


	grid.arrange(plot1, plot2, ncol=2)


	#--------------------------------------#
	#--------------------------------------#
	#--------------------------------------#

	# 517 x 11
	# mean VIF: 1.727451
	fires <- read.csv("./forestfires.csv")
	fires$month <- NULL
	fires$area <- log(fires$area+1)



	X <- model.matrix(area ~ ., data=fires)[,-1]
	y <- fires$area

	alphas <- seq(0.0, 1, by=.01)
	mses <- numeric(101)
	mins <- numeric(101)
	maxes <- numeric(101)



	for(i in 1:101){
	cvfits <- cv.glmnet(X, y, alpha=alphas[i], nfolds=400)
	loc <- which(cvfits$lambda==cvfits$lambda.min)
	maxes[i] <- cvfits$lambda %>% max
	mins[i] <- cvfits$lambda %>% min
	mses[i] <- cvfits$cvm[loc]
	}

	this <- data.frame(mse=mses, alpha=alphas)

	plot1 <- ggplot(this, aes(x=alpha, y=mse)) +
	geom_point(shape=1) +
	geom_smooth() +
	ylab("400-fold cross validation mean squared error") +
	xlab("alpha parameter") +
	ggtitle("model error of highest performing regularized
	elastic-net regression as a function of alpha parameter
	<using forest fire data>")



	# kitchen.sink <- glm(area ~ ., data=fires)
	# cv.glm(fires, kitchen.sink)$delta[2] # 2.131694
	#
	# step(kitchen.sink, direction="both") # 4 variables
	# stepped <- glm(area ~ X + DMC + RH + wind, data=fires)
	# cv.glm(fires, stepped) # 1.951653
	#



	errors <- numeric(nrow(fires))
	for(i in 1:nrow(fires)){
	train <- fires[-i,]
	test <- fires[i,]
	kitchen.sink <- glm(area ~ ., data=train)
	the.pred <- predict(kitchen.sink, newdata=test)
	errors[i] <- (the.pred - test$area)^2
	}
	mean(errors) # 2.132028


	full <- glm(area ~ ., data=fires)
	stepped <- step(full, direction = "both")
	sub.coeffs <- names(stepped$coefficients[-1])

	errors <- numeric(nrow(fires))
	for(i in 1:nrow(fires)){
	train <- fires[-i,]
	test <- fires[i,]
	new.feat.sub <- train[, sub.coeffs]
	new.feat.sub$area <- train$area
	mod <- glm(area ~ ., data=new.feat.sub)
	the.pred <- predict(mod, newdata=test)
	errors[i] <- (the.pred - test$area)^2
	}
	mean(errors) # 1.951689


	# much less variation in selected features
	errors <- numeric(nrow(fires))
	for(i in 1:nrow(fires)){
	train <- fires[-i,]
	test <- fires[i,]
	smaller <- glm(area ~ ., data=train)
	stepped <- step(smaller, direction="both", trace=0)
	new.feat.sub <- train[, names(stepped$coefficients[-1])]
	print(names(stepped$coefficients[-1]))
	new.feat.sub$area <- train$area
	mod <- glm(area ~ ., data=new.feat.sub)
	the.pred <- predict(mod, newdata=test)
	errors[i] <- (the.pred - test$area)^2
	}
	mean(errors) # 2.151955


	other.errors <- data.frame(method=c("kitchen sink",
	"bad stepwise est.",
	"better stepwise est."),
	errors=c(2.132028, 1.951689, 2.151955))

	plot2 <- ggplot(this, aes(x=alpha, y=mse)) +
	geom_point(shape=1) +
	geom_smooth() +
	ylab("400-fold cross validation mean squared error") +
	xlab("alpha parameter") +
	ggtitle("[with kitchen sink and stepwise LOOCV MSEs]") +
	geom_hline(aes(yintercept=errors,
	color=method, group=method),
	size=2, data=other.errors, show_guide=TRUE)

	plot2 <- plot2 + ylab("400-fold cross validation mean squared error")


	grid.arrange(plot1, plot2, ncol=2)

	#--------------------------------------#
	#--------------------------------------#
	#--------------------------------------#

	# 100x20
	# mean vif: 1.225789
	data(QuickStartExample)
	X <- x

	mock.frame <- data.frame(X)
	mock.frame$target <- y


	alphas <- seq(0, 1, by=.002)
	mses <- numeric(501)
	mins <- numeric(501)
	maxes <- numeric(501)


	for(i in 1:501){
	cvfits <- cv.glmnet(X, y, alpha=alphas[i], nfolds=100)
	loc <- which(cvfits$lambda==cvfits$lambda.min)
	maxes[i] <- cvfits$lambda %>% max
	mins[i] <- cvfits$lambda %>% min
	mses[i] <- cvfits$cvm[loc]
	}

	this <- data.frame(mse=mses, alpha=alphas)

	plot1 <- ggplot(this, aes(x=alpha, y=mse)) +
	geom_point(shape=1) +
	#geom_smooth() +
	ylab("LOOCV mean squared error") +
	xlab("alpha parameter") +
	ggtitle("model error of highest performing regularized
	elastic-net regression as a function of alpha parameter
	<'quickstart' glmnet data>")



	# kitchen.sink <- glm(target ~ ., data=mock.frame)
	# cv.glm(mock.frame, kitchen.sink)$delta[2] # 1.142114
	#
	# step(kitchen.sink, direction="both") # 8 variables
	# stepped <- glm(target ~ X1 + X3 + X5 + X6 + X8 + X11 + X14 + X20, data=mock.frame)
	# cv.glm(mock.frame, stepped) # 0.9433261


	errors <- numeric(nrow(mock.frame))
	for(i in 1:nrow(mock.frame)){
	train <- mock.frame[-i,]
	test <- mock.frame[i,]
	kitchen.sink <- glm(target ~ ., data=train)
	the.pred <- predict(kitchen.sink, newdata=test)
	errors[i] <- (the.pred - test$target)^2
	}
	mean(errors) # 1.144384


	full <- glm(target ~ ., data=mock.frame)
	stepped <- step(full, direction = "both")
	sub.coeffs <- names(stepped$coefficients[-1])

	errors <- numeric(nrow(mock.frame))
	for(i in 1:nrow(mock.frame)){
	train <- mock.frame[-i,]
	test <- mock.frame[i,]
	new.feat.sub <- train[, sub.coeffs]
	new.feat.sub$target <- train$target
	mod <- glm(target ~ ., data=new.feat.sub)
	the.pred <- predict(mod, newdata=test)
	errors[i] <- (the.pred - test$target)^2
	}
	mean(errors) # 0.9441541


	# virtually no variation in selected features
	errors <- numeric(nrow(mock.frame))
	for(i in 1:nrow(mock.frame)){
	train <- mock.frame[-i,]
	test <- mock.frame[i,]
	smaller <- glm(target ~ ., data=train)
	stepped <- step(smaller, direction="both", trace=0)
	new.feat.sub <- train[, names(stepped$coefficients[-1])]
	print(names(stepped$coefficients[-1]))
	new.feat.sub$target <- train$target
	mod <- glm(target ~ ., data=new.feat.sub)
	the.pred <- predict(mod, newdata=test)
	errors[i] <- (the.pred - test$target)^2
	}
	mean(errors) # 0.9544712



	other.errors <- data.frame(method=c("kitchen sink",
	"bad stepwise est.",
	"better stepwise est."),
	errors=c(1.144384, 0.9441541, 0.9544712))

	plot2 <- ggplot(this, aes(x=alpha, y=mse)) +
	geom_point(shape=1) +
	#geom_smooth() +
	ylab("LOOCV mean squared error") +
	xlab("alpha parameter") +
	ggtitle("[with kitchen sink and stepwise LOOCV MSEs]") +
	geom_hline(aes(yintercept=errors,
	color=method, group=method),
	size=2, data=other.errors, show_guide=TRUE)


	grid.arrange(plot1, plot2, ncol=2)