Skip to content

Instantly share code, notes, and snippets.

@diamonaj
Created February 9, 2023 15:54
Show Gist options
  • Save diamonaj/4dd6ce955aeff711682e81238b76e98a to your computer and use it in GitHub Desktop.
# Setup: install the 'tree' package only when it is missing, instead of
# unconditionally re-installing it on every run of the script.
if (!requireNamespace("tree", quietly = TRUE)) {
  install.packages("tree")
}
library(MASS)   # provides the Pima.tr / Pima.te diabetes data sets
library(tree)   # classification / regression trees
head(Pima.tr)   # quick look at the training data
#############
set.seed(1234)

# Fit a classification tree on the training set, predicting diabetes
# status (type) from all remaining predictors.
basic_tree <- tree(type ~ ., data = Pima.tr)

# Use the "basic_tree" model above to predict class labels in the test set.
predict_basic_tree <- predict(basic_tree, newdata = Pima.te, type = "class")

# Confusion matrix: rows = actual class, columns = predicted class.
cat("\n By the way, let's obtain the test set error\n\n")
conf_test <- table(Pima.te$type, predict_basic_tree)
print(conf_test)

# Test-set accuracy / error for the simple (not cross-validated) tree.
# Computed from the confusion matrix — sum(diag(.)) counts correct
# classifications — rather than hard-coding the cell counts
# (170, 65, 53, 44), so the result stays correct if data or seed change.
accuracy <- sum(diag(conf_test)) / sum(conf_test)
error_rate <- 1 - accuracy
accuracy
error_rate
######################################################
cat("\n By the way, let's obtain the training set error\n\n")
# Again, this is for the simple tree (not cross-validated).
predict_basic_tree.tr <- predict(basic_tree, newdata = Pima.tr, type = "class")
summary(predict_basic_tree.tr)

# Confusion matrix on the training data: rows = actual, columns = predicted.
conf_train <- table(Pima.tr$type, predict_basic_tree.tr)
print(conf_train)

# Training accuracy computed from the confusion matrix — the original
# hard-coded 122 + 55 correct out of 200, which silently breaks if the
# data or the seed ever change.
accuracy.tr <- sum(diag(conf_train)) / sum(conf_train)

# Visualize and summarize the fitted tree.
plot(basic_tree)
text(basic_tree)
summary(basic_tree)
########################
cat("\nThe estimated test set error rate for the basic tree (before cross-validating) is:",
    round((1 - accuracy) * 100), "%\n\n")
cat("\nThe estimated training set error rate for the basic tree (before cross-validating) is:",
    round((1 - accuracy.tr) * 100), "%\n\n")
###############
## Now we're going to figure out how much we should prune the tree...
## Pruned tree ##
set.seed(234235)

# Cross-validate over candidate tree sizes, scoring each size by its
# misclassification rate (prune.misclass) rather than deviance.
pruned_tree <- cv.tree(basic_tree, FUN = prune.misclass)
print(pruned_tree)

# Plot cross-validated error against the number of terminal nodes.
plot(
  x    = pruned_tree$size,
  y    = pruned_tree$dev,
  main = "Relationship between Tree Size and Error",
  type = "b"
)

# Annotate each point with the cost-complexity parameter alpha (a.k.a. k).
text(
  x      = pruned_tree$size,
  y      = pruned_tree$dev,
  labels = round(pruned_tree$k, 2),
  pos    = 4,
  col    = "darkred",
  cex    = 1.2
)
#########################################
# The CV results above tell us the optimal number of terminal nodes;
# prune the training-set tree down to that size (11 here).
best_tree <- prune.misclass(basic_tree, best = 11)
summary(best_tree)
plot(best_tree)
text(best_tree)
#######################
# Bring the pruned model to the test set and build another confusion matrix.
predict_best_tree <- predict(best_tree, newdata = Pima.te, type = "class")
plot(best_tree)
text(best_tree)

cat("\nHere is a test set accuracy rate for the cross-validated tree with 11 terminal nodes\n")
conf_best <- table(Pima.te$type, predict_best_tree)
print(conf_best)

# Accuracy / error derived from the confusion matrix instead of
# hard-coded cell counts (173, 72, 50, 37), and assigned with `<-`
# rather than `=` per R convention.
accuracy_rate_1 <- sum(diag(conf_best)) / sum(conf_best)
error_rate_1 <- 1 - accuracy_rate_1
cat("\nThe test error rate for our cross-validated tree =",
    round(100 * error_rate_1), "%\n")
########################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment