Skip to content

Instantly share code, notes, and snippets.

@diamonaj
Created February 9, 2023 15:54
Show Gist options
  • Save diamonaj/4dd6ce955aeff711682e81238b76e98a to your computer and use it in GitHub Desktop.
# Setup: install the 'tree' package only when it is missing, instead of
# unconditionally re-installing it on every run of the script.
if (!requireNamespace("tree", quietly = TRUE)) {
  install.packages("tree")
}
library(MASS)   # provides the Pima.tr / Pima.te diabetes data sets
library(tree)   # classification / regression trees
head(Pima.tr)   # quick look at the training data
#############
set.seed(1234)

# Fit a classification tree on the training set, predicting diabetes
# status (type) from all remaining predictors.
basic_tree <- tree(type ~ ., data = Pima.tr)

# Use the "basic_tree" model above to predict class labels in the test set.
predict_basic_tree <- predict(basic_tree, newdata = Pima.te, type = "class")

# Confusion matrix: rows = actual class, columns = predicted class.
cat("\n By the way, let's obtain the test set error\n\n")
conf_test <- table(Pima.te$type, predict_basic_tree)
print(conf_test)

# Test-set accuracy / error for the simple (not cross-validated) tree.
# Computed from the confusion matrix — sum(diag(.)) counts correct
# classifications — rather than hard-coding the cell counts
# (170, 65, 53, 44), so the result stays correct if data or seed change.
accuracy <- sum(diag(conf_test)) / sum(conf_test)
error_rate <- 1 - accuracy
accuracy
error_rate
######################################################
cat("\n By the way, let's obtain the training set error\n\n")
# Again, this is for the simple tree (not cross-validated).
predict_basic_tree.tr <- predict(basic_tree, newdata = Pima.tr, type = "class")
summary(predict_basic_tree.tr)

# Confusion matrix on the training data: rows = actual, columns = predicted.
conf_train <- table(Pima.tr$type, predict_basic_tree.tr)
print(conf_train)

# Training accuracy computed from the confusion matrix — the original
# hard-coded 122 + 55 correct out of 200, which silently breaks if the
# data or the seed ever change.
accuracy.tr <- sum(diag(conf_train)) / sum(conf_train)

# Visualize and summarize the fitted tree.
plot(basic_tree)
text(basic_tree)
summary(basic_tree)
########################
cat("\nThe estimated test set error rate for the basic tree (before cross-validating) is:",
    round((1 - accuracy) * 100), "%\n\n")
cat("\nThe estimated training set error rate for the basic tree (before cross-validating) is:",
    round((1 - accuracy.tr) * 100), "%\n\n")
###############
## Now we're going to figure out how much we should prune the tree...
## Pruned tree ##
set.seed(234235)

# Cross-validate over candidate tree sizes, scoring each size by its
# misclassification rate (prune.misclass) rather than deviance.
pruned_tree <- cv.tree(basic_tree, FUN = prune.misclass)
print(pruned_tree)

# Plot cross-validated error against the number of terminal nodes.
plot(
  x    = pruned_tree$size,
  y    = pruned_tree$dev,
  main = "Relationship between Tree Size and Error",
  type = "b"
)

# Annotate each point with the cost-complexity parameter alpha (a.k.a. k).
text(
  x      = pruned_tree$size,
  y      = pruned_tree$dev,
  labels = round(pruned_tree$k, 2),
  pos    = 4,
  col    = "darkred",
  cex    = 1.2
)
#########################################
# The CV results above tell us the optimal number of terminal nodes;
# prune the training-set tree down to that size (11 here).
best_tree <- prune.misclass(basic_tree, best = 11)
summary(best_tree)
plot(best_tree)
text(best_tree)
#######################
# Bring the pruned model to the test set and build another confusion matrix.
predict_best_tree <- predict(best_tree, newdata = Pima.te, type = "class")
plot(best_tree)
text(best_tree)

cat("\nHere is a test set accuracy rate for the cross-validated tree with 11 terminal nodes\n")
conf_best <- table(Pima.te$type, predict_best_tree)
print(conf_best)

# Accuracy / error derived from the confusion matrix instead of
# hard-coded cell counts (173, 72, 50, 37), and assigned with `<-`
# rather than `=` per R convention.
accuracy_rate_1 <- sum(diag(conf_best)) / sum(conf_best)
error_rate_1 <- 1 - accuracy_rate_1
cat("\nThe test error rate for our cross-validated tree =",
    round(100 * error_rate_1), "%\n")
########################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment