Created
February 24, 2017 20:55
-
-
Save Laurae2/758b195a0f92ad62cc9d9491b99a228e to your computer and use it in GitHub Desktop.
Forward Stepwise Regression in R without step/lm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setting up random matrix | |
set.seed(11111) | |
data <- data.frame(a = rnorm(n = 15) * 5, | |
b = rnorm(n = 15) * 3 + 1, | |
c = rnorm(n = 15) * 2 + 2) | |
# Setting up the (perfect) linear relationship | |
preds <- 2 + (data[, 1] * 2) + (data[, 2] * 3) + (data[, 3] * 4) + (data[, 3] ^ 2) + (data[, 1] * data[, 2]) | |
# Setting up polynomial features | |
columns <- ncol(data) | |
for (i in 1:columns) { | |
data[, paste0(colnames(data)[i], "X", colnames(data)[i])] <- data[, i] * data[, i] | |
for (j in i:columns) { | |
data[, paste0(colnames(data)[i], "X", colnames(data)[j])] <- data[, i] * data[, j] | |
} | |
} | |
data <- as.matrix(cbind(Intercept = 1, data)) | |
features <- c(TRUE, rep(FALSE, 9)) | |
for (i in 1:ncol(data)) { | |
# Create temporary IC vector | |
IC <- rep(0, sum(!features) + 1) | |
# Loop for adding NO feature | |
temp_data <- as.matrix(data[, features]) | |
# Getting linear regression coefficients | |
coefficients <- solve(t(temp_data) %*% temp_data, tol = 1e-30) %*% t(temp_data) %*% preds | |
# Predicting on data | |
values <- temp_data %*% coefficients | |
rss <- sum((preds - values) ^ 2) | |
# Compute IC | |
IC[1] <- nrow(temp_data) * (log((rss / nrow(temp_data)))) + ((length(coefficients)) * 2) | |
#IC[1] <- nrow(temp_data) * (log((rss / nrow(temp_data)))) + ((length(coefficients)) * log(nrow(data))) | |
cat("IC for Null feature: ", IC[1], ", RSS=", rss, "\n", sep = "") | |
# Make vector of allowed features to add | |
features_to_add <- which(!features) | |
for (j in features_to_add) { | |
# Loop for adding one feature to compare later | |
new_features <- features | |
new_features[j] <- TRUE | |
temp_data <- as.matrix(data[, new_features]) | |
# Getting linear regression coefficients | |
coefficients <- solve(t(temp_data) %*% temp_data, tol = 1e-30) %*% t(temp_data) %*% preds | |
# Predicting on data | |
values <- temp_data %*% coefficients | |
rss <- sum((preds - values) ^ 2) | |
# Compute IC | |
IC[which(j == features_to_add) + 1] <- nrow(temp_data) * (log((rss / nrow(temp_data)))) + ((length(coefficients)) * 2) | |
#IC[which(j == features_to_add) + 1] <- nrow(temp_data) * (log((rss / nrow(temp_data)))) + ((length(coefficients)) * log(nrow(data))) | |
cat("IC for feature ", colnames(data)[j], ": ", IC[which(j == features_to_add) + 1], ", RSS=", rss, "\n", sep = "") | |
} | |
# Perform model selection (select best feature) | |
feature_selected <- which.min(IC) | |
if (feature_selected == 1) { | |
cat("Selected no feature, with IC=", IC[feature_selected], ". Interrupting now.", sep = "") | |
break() | |
} else { | |
cat("Selecting feature ", colnames(data)[features_to_add[feature_selected - 1]], " with IC=", IC[feature_selected], "\n\n", sep = "") | |
} | |
# Add the best feature to the feature list | |
features[features_to_add[feature_selected - 1]] <- TRUE | |
} | |
# Do final learning | |
temp_data <- as.matrix(data[, features]) | |
# Getting linear regression coefficients | |
coefficients <- solve(t(temp_data) %*% temp_data, tol = 1e-30) %*% t(temp_data) %*% preds | |
names(coefficients) <- colnames(data)[which(features)] | |
# Predicting on data | |
values <- temp_data %*% coefficients | |
rss <- sum((preds - values) ^ 2) | |
# Print stuff | |
print(coefficients) | |
cat("Sum of squared errors: ", rss, sep = "") | |
# Compare with R stepwise regression | |
step(lm(preds ~ 1, data = data.frame(data[, 2:10], preds)), as.formula(paste0("preds ~ ", paste(colnames(data)[2:10], collapse = "+"))), direction = "forward", k = 2) | |
step(lm(preds ~ 1, data = data.frame(data[, 2:10], preds)), as.formula(paste0("preds ~ ", paste(colnames(data)[2:10], collapse = "+"))), direction = "forward", k = log(nrow(data))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment