Skip to content

Instantly share code, notes, and snippets.

@diamonaj
Last active October 19, 2018 12:33
Show Gist options
  • Save diamonaj/ebbd1ac079d8b403ab8ee8e48d0ee2b7 to your computer and use it in GitHub Desktop.
Save diamonaj/ebbd1ac079d8b403ab8ee8e48d0ee2b7 to your computer and use it in GitHub Desktop.
Pima RF Exercise
################ PRELIMINARIES
library(MASS)
data(Pima.tr)
library(tree)
library(randomForest)
## STEP 1: Logistic regression ##
# Baseline model: regress `type` on every other column of the training data.
logistic_reg <- glm(
  formula = type ~ .,
  family = binomial,
  data = Pima.tr
)
# Predicted probabilities for the TRAINING SET: no `newdata` is supplied, so
# predict() uses the observations the model was fitted on.
predict_logistic.tr <- predict(object = logistic_reg, type = "response")
# Misclassification rate of the logistic model on the TRAINING SET for a given
# probability threshold.
#
# Reads two globals: `predict_logistic.tr` (training-set predicted
# probabilities) and `Pima.tr` (training data with the factor outcome `type`).
#
# threshold: probability cut-off; an observation is classified as a "success"
#            when its predicted probability is strictly greater than it.
# Returns:   the fraction of training observations that are misclassified
#            (NA when `threshold` is NA).
evaluate_fn <- function(threshold = NA) {
  predicted_outcomes <- as.numeric(predict_logistic.tr > threshold)
  # glm(family = binomial) treats the first factor level as "failure" (0) and
  # every other level as "success" (1); encode the observed outcome the same way.
  actual_outcomes <- as.numeric(Pima.tr$type != levels(Pima.tr$type)[1])
  # Compare labels directly instead of indexing a confusion table: when every
  # prediction falls in one class the table has a single column, and picking
  # cells 2:3 from it yields NA rather than the true error rate.
  mean(predicted_outcomes != actual_outcomes)
}
# Choose the classification threshold within the TRAINING SET.
# The misclassification rate is a step function of the threshold, so a general
# purpose optimiser has no slope to follow (and optim() warns that its default
# Nelder-Mead method is unreliable for one-dimensional problems). An exhaustive
# search over a fine grid is simple and deterministic.
candidate_thresholds <- seq(0.01, 0.99, by = 0.01)
training_error_rates <- vapply(candidate_thresholds, evaluate_fn, numeric(1))
# which.min() ignores NA values and returns the lowest threshold among ties.
best_threshold <- candidate_thresholds[which.min(training_error_rates)]
# Produce predicted probabilities for the test set
predict_logistic <- predict(logistic_reg, newdata = Pima.te, type = "response")
# Convert those predicted probabilities to predicted TEST SET outcomes
predicted_logistic_outcomes <- as.numeric(predict_logistic > best_threshold)
# Confusion table for the TEST SET (rows: observed type, columns: predicted 0/1)
table_logistic <- table(Pima.te$type, predicted_logistic_outcomes)
print(table_logistic)
# Measure misclassification error, in TEST SET.
# Labels are compared directly rather than summing cells 2:3 of the table,
# which only works when the table is 2x2 (i.e. both classes were predicted).
# The first factor level is coded 0, matching glm(family = binomial).
actual_logistic_outcomes <- as.numeric(Pima.te$type != levels(Pima.te$type)[1])
error_rate_logistic <- mean(predicted_logistic_outcomes != actual_logistic_outcomes)
print(error_rate_logistic)
## Basic tree ##
# Grow an unpruned classification tree on the training data.
basic_tree <- tree(type ~ ., data = Pima.tr)
# Predicted class labels for the TEST SET.
predict_basic_tree <- predict(basic_tree, newdata = Pima.te, type = "class")
# Confusion table (rows: observed type, columns: predicted class); build it
# once, then display it.
table_basic_tree <- table(Pima.te$type, predict_basic_tree)
table_basic_tree
# Misclassification rate = off-diagonal cells / all test observations.
misclassified_basic_tree <- table_basic_tree[2, 1] + table_basic_tree[1, 2]
error_rate_basic_tree <- misclassified_basic_tree / sum(table_basic_tree)
print(error_rate_basic_tree)
## Pruned tree ##
# Cross-validate the cost-complexity pruning sequence, scoring candidate
# subtrees by misclassification. The result is a summary of the pruning
# sequence, not a tree, so it is kept in its own variable instead of being
# stored in (and then overwritten by) `pruned_tree`.
cv_pruning <- cv.tree(basic_tree, FUN = prune.misclass)
print(cv_pruning)
# Prune the full tree at cost-complexity parameter k = 1.5.
# NOTE(review): k = 1.5 is presumably read off the CV output printed above;
# the CV folds are random, so re-check this value after re-running cv.tree().
pruned_tree <- prune.misclass(basic_tree, k = 1.5)
# Predicted class labels for the TEST SET from the pruned tree.
predict_pruned_tree <- predict(pruned_tree, newdata = Pima.te, type = "class")
#NEXT STEP: Evaluate pruned tree performance in the test set
## Random forest ##
# STEP 1: Fit a random forest model on the training set (Pima.tr)
# STEP 2: Use your model to predict outcomes for the test set (Pima.te)
# STEP 3: Evaluate model performance (misclassification rate) in the test set
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment