diamonaj · October 19, 2018 12:33
diff --git a/gistfile1.txt b/gistfile1.txt
 ################ PRELIMINARIES
 library(MASS)
 data(Pima.tr)
 library(tree)
 library(randomForest)

 ## STEP 1: Logistic regression ##
 # Model `type` as a function of all remaining columns of the training data.
 logistic_reg <- glm(
   formula = type ~ .,
   data = Pima.tr,
   family = binomial
 )
 # Predicted probabilities (TRAINING SET): no `newdata` is passed, so the
 # predictions are for the rows the model was fitted on.
 predict_logistic.tr <- predict(object = logistic_reg, type = "response")

 # Misclassification rate on the TRAINING SET for a given probability threshold.
 #
 # threshold: numeric cut-off; rows whose fitted probability is strictly
 #            greater than it are predicted as 1, all others as 0. The default
 #            remains NA (kept for backward compatibility), which returns NA.
 # Returns:   the share of training rows whose predicted class differs from
 #            the observed class.
 #
 # Reads the globals `predict_logistic.tr` and `Pima.tr`.
 evaluate_fn <- function(threshold = NA) {
   predicted_outcomes <- as.numeric(predict_logistic.tr > threshold)

   # glm's binomial family treats the first factor level as failure and the
   # other as success, so code the second level of `type` as 1.
   # Assumes `type` is a two-level factor.
   actual_outcomes <- as.numeric(as.integer(Pima.tr$type) == 2L)

   # Compare element-wise rather than indexing cells 2:3 of a confusion table:
   # when every prediction falls in one class the table has a single column,
   # cell 3 does not exist, and the old index returned NA.
   mean(predicted_outcomes != actual_outcomes)
 }

 # Optimize for threshold within TRAINING SET
 # The error rate is a step function of the threshold, so optim()'s default
 # Nelder-Mead search is not guaranteed to find its minimum (and R warns that
 # Nelder-Mead is unreliable in one dimension). Scan a grid of cut-offs instead.
 candidate_thresholds <- seq(0.01, 0.99, by = 0.01)
 training_error_rates <- vapply(candidate_thresholds, evaluate_fn, numeric(1))
 # which.min() ignores NA and returns the first (lowest) threshold among ties.
 best_threshold <- candidate_thresholds[which.min(training_error_rates)]

 # Produce predicted probabilities for the test set
 predict_logistic <- predict(logistic_reg, newdata = Pima.te, type = "response")

 # Convert those predicted probabilities to predicted TEST SET outcomes
 # (1 when the probability is strictly above the training-set threshold)
 predicted_logistic_outcomes <- as.numeric(predict_logistic > best_threshold)

 # Measure misclassification error, in TEST SET
 # Build the confusion table once and print it explicitly, so it is also
 # displayed when the script is run through source().
 table_logistic <- table(Pima.te$type, predicted_logistic_outcomes)
 print(table_logistic)

 # Compare predictions with the observed classes element-wise. Indexing cells
 # 2:3 of the table is only valid for a 2x2 table; if every prediction lands
 # in one class the table has a single column and that index yields NA.
 # The second level of `type` is coded 1 to match glm's binomial convention.
 # Assumes `type` is a two-level factor.
 actual_logistic_outcomes <- as.numeric(as.integer(Pima.te$type) == 2L)
 error_rate_logistic <- mean(
   predicted_logistic_outcomes != actual_logistic_outcomes
 )
 print(error_rate_logistic)

 ## Basic tree ##
 # Fit a tree for `type` on the training data, with every other column
 # available as a predictor.
 basic_tree <- tree(formula = type ~ ., data = Pima.tr)
 # Predicted class labels for the TEST SET
 predict_basic_tree <- predict(
   basic_tree,
   newdata = Pima.te,
   type = "class"
 )

 # Confusion table: observed type in rows, predicted class in columns
 table_basic_tree <- table(Pima.te$type, predict_basic_tree)
 table_basic_tree

 # Cells 2 and 3 (column-major) are the off-diagonal, i.e. misclassified, counts
 error_rate_basic_tree <-
   sum(table_basic_tree[c(2L, 3L)]) / sum(table_basic_tree)
 print(error_rate_basic_tree)

 ## Pruned tree ##
 # Cross-validate the pruning sequence of the basic tree, pruning on
 # misclassifications. The result is kept under its own name so that it is
 # not overwritten by the pruned tree fitted below.
 cv_pruned_tree <- cv.tree(basic_tree, FUN = prune.misclass)
 print(cv_pruned_tree)

 # NOTE(review): k = 1.5 is hard-coded, presumably read off the CV output
 # printed above -- re-check it when the cross-validation is re-run.
 pruned_tree <- prune.misclass(basic_tree, k = 1.5)
 predict_pruned_tree <- predict(pruned_tree, newdata = Pima.te, type = "class")

 #NEXT STEP: Evaluate pruned tree performance in the test set


 ## Random forest ##

 #STEP 1: Run Random Forest model

 #STEP 2: Use your model to predict for the test set
  
 #STEP 3: Evaluate model performance in the test set
	################ PRELIMINARIES
	library(MASS)
	data(Pima.tr)
	library(tree)
	library(randomForest)

	## STEP 1: Logistic regression ##
	# Model `type` as a function of all remaining columns of the training data.
	logistic_reg <- glm(
	  formula = type ~ .,
	  data = Pima.tr,
	  family = binomial
	)
	# Predicted probabilities (TRAINING SET): no `newdata` is passed, so the
	# predictions are for the rows the model was fitted on.
	predict_logistic.tr <- predict(object = logistic_reg, type = "response")

	# Misclassification rate on the TRAINING SET for a given probability threshold.
	#
	# threshold: numeric cut-off; rows whose fitted probability is strictly
	#            greater than it are predicted as 1, all others as 0. The default
	#            remains NA (kept for backward compatibility), which returns NA.
	# Returns:   the share of training rows whose predicted class differs from
	#            the observed class.
	#
	# Reads the globals `predict_logistic.tr` and `Pima.tr`.
	evaluate_fn <- function(threshold = NA) {
	  predicted_outcomes <- as.numeric(predict_logistic.tr > threshold)

	  # glm's binomial family treats the first factor level as failure and the
	  # other as success, so code the second level of `type` as 1.
	  # Assumes `type` is a two-level factor.
	  actual_outcomes <- as.numeric(as.integer(Pima.tr$type) == 2L)

	  # Compare element-wise rather than indexing cells 2:3 of a confusion table:
	  # when every prediction falls in one class the table has a single column,
	  # cell 3 does not exist, and the old index returned NA.
	  mean(predicted_outcomes != actual_outcomes)
	}

	# Optimize for threshold within TRAINING SET
	# The error rate is a step function of the threshold, so optim()'s default
	# Nelder-Mead search is not guaranteed to find its minimum (and R warns that
	# Nelder-Mead is unreliable in one dimension). Scan a grid of cut-offs instead.
	candidate_thresholds <- seq(0.01, 0.99, by = 0.01)
	training_error_rates <- vapply(candidate_thresholds, evaluate_fn, numeric(1))
	# which.min() ignores NA and returns the first (lowest) threshold among ties.
	best_threshold <- candidate_thresholds[which.min(training_error_rates)]

	# Produce predicted probabilities for the test set
	predict_logistic <- predict(logistic_reg, newdata = Pima.te, type = "response")

	# Convert those predicted probabilities to predicted TEST SET outcomes
	# (1 when the probability is strictly above the training-set threshold)
	predicted_logistic_outcomes <- as.numeric(predict_logistic > best_threshold)

	# Measure misclassification error, in TEST SET
	# Build the confusion table once and print it explicitly, so it is also
	# displayed when the script is run through source().
	table_logistic <- table(Pima.te$type, predicted_logistic_outcomes)
	print(table_logistic)

	# Compare predictions with the observed classes element-wise. Indexing cells
	# 2:3 of the table is only valid for a 2x2 table; if every prediction lands
	# in one class the table has a single column and that index yields NA.
	# The second level of `type` is coded 1 to match glm's binomial convention.
	# Assumes `type` is a two-level factor.
	actual_logistic_outcomes <- as.numeric(as.integer(Pima.te$type) == 2L)
	error_rate_logistic <- mean(
	  predicted_logistic_outcomes != actual_logistic_outcomes
	)
	print(error_rate_logistic)

	## Basic tree ##
	# Fit a tree for `type` on the training data, with every other column
	# available as a predictor.
	basic_tree <- tree(formula = type ~ ., data = Pima.tr)
	# Predicted class labels for the TEST SET
	predict_basic_tree <- predict(
	  basic_tree,
	  newdata = Pima.te,
	  type = "class"
	)

	# Confusion table: observed type in rows, predicted class in columns
	table_basic_tree <- table(Pima.te$type, predict_basic_tree)
	table_basic_tree

	# Cells 2 and 3 (column-major) are the off-diagonal, i.e. misclassified, counts
	error_rate_basic_tree <-
	  sum(table_basic_tree[c(2L, 3L)]) / sum(table_basic_tree)
	print(error_rate_basic_tree)

	## Pruned tree ##
	# Cross-validate the pruning sequence of the basic tree, pruning on
	# misclassifications. The result is kept under its own name so that it is
	# not overwritten by the pruned tree fitted below.
	cv_pruned_tree <- cv.tree(basic_tree, FUN = prune.misclass)
	print(cv_pruned_tree)

	# NOTE(review): k = 1.5 is hard-coded, presumably read off the CV output
	# printed above -- re-check it when the cross-validation is re-run.
	pruned_tree <- prune.misclass(basic_tree, k = 1.5)
	predict_pruned_tree <- predict(pruned_tree, newdata = Pima.te, type = "class")

	#NEXT STEP: Evaluate pruned tree performance in the test set


	## Random forest ##

	#STEP 1: Run Random Forest model

	#STEP 2: Use your model to predict for the test set

	#STEP 3: Evaluate model performance in the test set