# install.packages("tree")   # install once if the tree package is not already available
library(MASS)   # provides the Pima.tr / Pima.te diabetes datasets
library(tree)   # classification and regression trees
head(Pima.tr)   # peek at the training data
#############
set.seed(1234)
# fit a classification tree on the training set, using all predictors
basic_tree <- tree(type ~ ., data = Pima.tr)
# use "basic tree" model above to predict in the test set | |
predict_basic_tree <- predict( | |
basic_tree, newdata = Pima.te, type = "class") | |
# produce a confusion matrix showing accuracy and error | |
cat("\n By the way, let's obtain the test set error\n\n") | |
table(Pima.te$type, predict_basic_tree) | |
# 170 true Neg are actually Neg | |
# 65 true Pos are actually Pos | |
# the rest are errors, mistakes (44 + 53) | |
## this is accuracy, or error for the test set error | |
## with a simple tree (not cross-validated) | |
accuracy <- (170+65) / (170 + 53 + 44 + 65) | |
error_rate <- 1 - accuracy | |
accuracy | |
error_rate | |
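## Sketch of a less error-prone alternative: compute accuracy directly from the
## confusion matrix instead of hard-coding the counts above, so the numbers
## update automatically if the seed or the data change.
conf_mat <- table(Pima.te$type, predict_basic_tree)
accuracy_from_table <- sum(diag(conf_mat)) / sum(conf_mat)
accuracy_from_table
1 - accuracy_from_table   # test set error rate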
######################################################
cat("\n By the way, let's obtain the training set error\n\n")
# again, this is for the simple tree (not cross-validated)
predict_basic_tree.tr <- predict(
  basic_tree, newdata = Pima.tr, type = "class")
summary(predict_basic_tree.tr)
table(Pima.tr$type, predict_basic_tree.tr)
# 122 + 55 observations are correctly classified
# 13 + 10 observations are misclassified
accuracy.tr <- (122 + 55) / (122 + 55 + 13 + 10)
# plot the unpruned tree and label its splits
plot(basic_tree)
text(basic_tree)
summary(basic_tree)
########################
cat("\nThe estimated test set error rate for the basic tree (before cross-validating) is:", round(error_rate * 100), "%\n\n")
cat("\nThe estimated training set error rate for the basic tree (before cross-validating) is:", round((1 - accuracy.tr) * 100), "%\n\n")
###############
## Now we're going to figure out how much we should prune the tree...
## Pruned tree ##
set.seed(234235)
# cross-validate the tree, using misclassification error to guide pruning
pruned_tree <- cv.tree(
  basic_tree, FUN = prune.misclass)
print(pruned_tree)
# let's plot the pruning result
plot(x = pruned_tree$size, y = pruned_tree$dev,
     main = "Relationship between Tree Size and Error",
     type = "b")
# label the points with the cost-complexity parameter alpha (also known as k)
text(x = pruned_tree$size, y = pruned_tree$dev,
     labels = round(pruned_tree$k, 2),
     pos = 4, col = "darkred", cex = 1.2)
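## Sketch: pick the tree size with the lowest cross-validated misclassification
## count programmatically ($dev holds the CV error counts, $size the number of
## terminal nodes). If several sizes tie, take the smallest for a simpler tree.
## With the seed above this should recover the 11-node tree used below.
candidate_sizes <- pruned_tree$size[pruned_tree$dev == min(pruned_tree$dev)]
best_size <- min(candidate_sizes)
best_size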
#########################################
# you now have information about how much to prune the tree,
# so refit on the training set, keeping the optimal number of terminal nodes
best_tree <- prune.misclass(basic_tree, best = 11)
summary(best_tree)
plot(best_tree)
text(best_tree)
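## Quick sanity check (a sketch, assuming a "tree" object stores its structure
## in $frame with leaves marked "<leaf>" in the var column): confirm the pruned
## tree really has 11 terminal nodes.
sum(best_tree$frame$var == "<leaf>")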
#######################
# and now, apply that model to the test set
# and produce another confusion matrix
predict_best_tree <- predict(best_tree, newdata = Pima.te, type = "class")
# produce a confusion matrix showing accuracy and error
cat("\nHere is the test set confusion matrix for the cross-validated tree with 11 terminal nodes\n")
table(Pima.te$type, predict_best_tree)
accuracy_rate_1 <- (173 + 72) / (173 + 50 + 37 + 72)
error_rate_1 <- 1 - accuracy_rate_1
cat("\nThe test error rate for our cross-validated tree =", round(100 * error_rate_1), "%\n")
########################
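## Side-by-side comparison (sketch): print the test error rates computed above
## for the unpruned tree and the cross-validation-pruned tree, using the counts
## already obtained from the two confusion matrices.
cat("Basic tree test error: ", round(100 * error_rate), "%\n",
    "Pruned tree test error:", round(100 * error_rate_1), "%\n")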