@Laurae2
Created February 24, 2017 20:55
Forward Stepwise Regression in R without step/lm
# Setting up random data (3 base features)
set.seed(11111)
data <- data.frame(a = rnorm(n = 15) * 5,
                   b = rnorm(n = 15) * 3 + 1,
                   c = rnorm(n = 15) * 2 + 2)
# Setting up the (perfect) linear relationship
preds <- 2 + (data[, 1] * 2) + (data[, 2] * 3) + (data[, 3] * 4) + (data[, 3] ^ 2) + (data[, 1] * data[, 2])
# Setting up polynomial features
columns <- ncol(data)
for (i in 1:columns) {
  data[, paste0(colnames(data)[i], "X", colnames(data)[i])] <- data[, i] * data[, i]
  for (j in i:columns) {
    data[, paste0(colnames(data)[i], "X", colnames(data)[j])] <- data[, i] * data[, j]
  }
}
data <- as.matrix(cbind(Intercept = 1, data))
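# Illustrative sanity check (not part of the original gist): the design matrix now holds
# the intercept, the 3 base features, and their 6 squared/pairwise products
dim(data)       # expected: 15 10
colnames(data)  # expected: "Intercept" "a" "b" "c" "aXa" "aXb" "aXc" "bXb" "bXc" "cXc"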
# Feature mask: only the intercept is selected at the start
features <- c(TRUE, rep(FALSE, 9))
# Forward stepwise search: at each step, compare the current model against
# every model with one additional feature, and keep the best by IC
for (i in 1:ncol(data)) {
  # Create temporary IC vector (slot 1 = current model, the rest = candidate features)
  IC <- rep(0, sum(!features) + 1)
  # Loop for adding NO feature
  temp_data <- as.matrix(data[, features])
  # Getting linear regression coefficients via the normal equations
  coefficients <- solve(t(temp_data) %*% temp_data, tol = 1e-30) %*% t(temp_data) %*% preds
  # Predicting on data
  values <- temp_data %*% coefficients
  rss <- sum((preds - values) ^ 2)
  # Compute IC (AIC form; swap in the commented line for BIC)
  IC[1] <- nrow(temp_data) * log(rss / nrow(temp_data)) + length(coefficients) * 2
  #IC[1] <- nrow(temp_data) * log(rss / nrow(temp_data)) + length(coefficients) * log(nrow(data))
  cat("IC for Null feature: ", IC[1], ", RSS=", rss, "\n", sep = "")
  # Make vector of allowed features to add
  features_to_add <- which(!features)
  for (j in features_to_add) {
    # Loop for adding one feature to compare later
    new_features <- features
    new_features[j] <- TRUE
    temp_data <- as.matrix(data[, new_features])
    # Getting linear regression coefficients
    coefficients <- solve(t(temp_data) %*% temp_data, tol = 1e-30) %*% t(temp_data) %*% preds
    # Predicting on data
    values <- temp_data %*% coefficients
    rss <- sum((preds - values) ^ 2)
    # Compute IC (AIC form; swap in the commented line for BIC)
    IC[which(j == features_to_add) + 1] <- nrow(temp_data) * log(rss / nrow(temp_data)) + length(coefficients) * 2
    #IC[which(j == features_to_add) + 1] <- nrow(temp_data) * log(rss / nrow(temp_data)) + length(coefficients) * log(nrow(data))
    cat("IC for feature ", colnames(data)[j], ": ", IC[which(j == features_to_add) + 1], ", RSS=", rss, "\n", sep = "")
  }
  # Perform model selection (select best feature)
  feature_selected <- which.min(IC)
  if (feature_selected == 1) {
    cat("Selected no feature, with IC=", IC[feature_selected], ". Stopping now.\n", sep = "")
    break
  } else {
    cat("Selecting feature ", colnames(data)[features_to_add[feature_selected - 1]], " with IC=", IC[feature_selected], "\n\n", sep = "")
  }
  # Add the best feature to the feature list
  features[features_to_add[feature_selected - 1]] <- TRUE
}
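# Illustrative (not part of the original gist): list the columns kept by the greedy search
cat("Selected columns: ", paste(colnames(data)[features], collapse = ", "), "\n", sep = "")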
# Do final learning
temp_data <- as.matrix(data[, features])
# Getting linear regression coefficients
coefficients <- solve(t(temp_data) %*% temp_data, tol = 1e-30) %*% t(temp_data) %*% preds
rownames(coefficients) <- colnames(data)[which(features)]
# Predicting on data
values <- temp_data %*% coefficients
rss <- sum((preds - values) ^ 2)
# Print stuff
print(coefficients)
cat("Sum of squared errors: ", rss, sep = "")
# Compare with R's built-in forward stepwise regression
# (k = 2 penalises by AIC, k = log(n) by BIC)
step(lm(preds ~ 1, data = data.frame(data[, 2:10], preds)),
     as.formula(paste0("preds ~ ", paste(colnames(data)[2:10], collapse = "+"))),
     direction = "forward", k = 2)
step(lm(preds ~ 1, data = data.frame(data[, 2:10], preds)),
     as.formula(paste0("preds ~ ", paste(colnames(data)[2:10], collapse = "+"))),
     direction = "forward", k = log(nrow(data)))