rudeboybert · March 23, 2020 00:00
diff --git a/CV_for_CART.R b/CV_for_CART.R
 # Based on data from "House Prices: Advanced Regression Techniques" Kaggle Competition
 # https://www.kaggle.com/c/house-prices-advanced-regression-techniques
 # YouTube demo can be found here: 

 library(tidyverse)
 library(rpart)
 library(Metrics)

 # Load the house prices training and test sets (both are read over the network)
 train <- read_csv("https://rudeboybert.github.io/SDS293/static/train.csv")
 test <- read_csv("https://rudeboybert.github.io/SDS293/static/test.csv")

 # Number of cross-validation folds
 k <- 5

 # Assign every training row to one of k folds: shuffle the rows, then deal out
 # the fold labels 1, 2, ..., k cyclically so fold sizes differ by at most one.
 # The shuffle is random and no seed is set here, so call set.seed() beforehand
 # if a reproducible fold assignment is needed.
 train <- train %>%
   sample_frac(size = 1) %>%
   # `length.out` is spelled out in full instead of relying on partial matching
   mutate(fold = rep(seq_len(k), length.out = n())) %>%
   arrange(fold)

 # Grid of candidate complexity parameter (cp) values to evaluate
 cp_values_grid <- seq(from = 0, to = 0.0015, length.out = 101)

 # error_estimates[j] holds the k-fold estimate of RMSLE for cp_values_grid[j]
 error_estimates <- rep(0, times = length(cp_values_grid))

 # seq_along()/seq_len() are used instead of 1:length(x) / 1:k so that the loops
 # run zero times (rather than over c(1, 0)) if the grid or k is ever empty/zero
 for (j in seq_along(cp_values_grid)) {
   current_cp_value <- cp_values_grid[j]

   # Reset the per-fold errors so no value carries over between cp values
   error_estimate_per_fold <- rep(0, times = k)

   for (i in seq_len(k)) {
     # Hold out fold i for validation and train on the remaining folds
     train_cv <- train %>%
       filter(fold != i)
     test_cv <- train %>%
       filter(fold == i)

     # Fit model using the current cp value
     trained_model <- rpart(
       SalePrice ~ GrLivArea + HalfBath + YearBuilt,
       data = train_cv,
       control = rpart.control(cp = current_cp_value)
     )

     # Get predictions for the held-out fold
     y_hat <- predict(trained_model, type = "vector", newdata = test_cv)

     # Get error on the held-out fold
     error_estimate_per_fold[i] <- rmsle(
       actual = test_cv$SalePrice,
       predicted = y_hat
     )
   }

   # Average the k held-out errors into one estimate for this cp value
   error_estimates[j] <- mean(error_estimate_per_fold)
 }

 # Pair each candidate cp value with its cross-validated error estimate
 blah <- tibble(cp_value = cp_values_grid, error_estimate = error_estimates)

 # Scatterplot of the estimated error against the complexity parameter
 ggplot(data = blah, mapping = aes(x = cp_value, y = error_estimate)) +
   geom_point() +
   xlab("Complexity parameter") +
   ylab("Estimate of RMSLE")



 # Bonus: Use optimal complexity parameter value to make submissions on Kaggle
 # Several cp values can tie for the lowest estimated RMSLE. Break ties with the
 # LARGEST cp value: rpart does not attempt any split that fails to improve the
 # fit by at least a factor of cp, so a larger cp yields the less complex tree.
 cp_star <- blah %>%
   arrange(error_estimate, desc(cp_value)) %>%
   slice(1) %>%
   pull(cp_value)
  
 # Refit the tree on the full training set using the selected cp value
 trained_model_all <- rpart(
   formula = SalePrice ~ GrLivArea + HalfBath + YearBuilt,
   data = train,
   control = rpart.control(cp = cp_star)
 )

 # Draw the fitted tree, add its text labels, then frame and title the figure
 plot(trained_model_all, margin = 0.25)
 text(trained_model_all, use.n = TRUE)
 box()
 title("Classification & Regression Tree")

 # Attach the final model's predictions to the test set as a new column
 test <- mutate(
   test,
   SalePriceHat = predict(trained_model_all, newdata = test, type = "vector")
 )

 # Save the predictions as a csv in the exact layout Kaggle expects, see
 # https://www.kaggle.com/c/house-prices-advanced-regression-techniques/submit
 # The file keeps only Id plus the prediction renamed to SalePrice.
 write_csv(select(test, Id, SalePrice = SalePriceHat), "submission.csv")

 # This yields an RMSLE of 0.22065!
 # (Folds are assigned at random, so the chosen cp and the score may vary.)
	# Based on data from "House Prices: Advanced Regression Techniques" Kaggle Competition
	# https://www.kaggle.com/c/house-prices-advanced-regression-techniques
	# YouTube demo can be found here:

	library(tidyverse)
	library(rpart)
	library(Metrics)

	# Load the house prices training and test sets (both are read over the network)
	train <- read_csv("https://rudeboybert.github.io/SDS293/static/train.csv")
	test <- read_csv("https://rudeboybert.github.io/SDS293/static/test.csv")

	# Number of cross-validation folds
	k <- 5

	# Assign every training row to one of k folds: shuffle the rows, then deal out
	# the fold labels 1, 2, ..., k cyclically so fold sizes differ by at most one.
	# The shuffle is random and no seed is set here, so call set.seed() beforehand
	# if a reproducible fold assignment is needed.
	train <- train %>%
	  sample_frac(size = 1) %>%
	  # `length.out` is spelled out in full instead of relying on partial matching
	  mutate(fold = rep(seq_len(k), length.out = n())) %>%
	  arrange(fold)

	# Grid of candidate complexity parameter (cp) values to evaluate
	cp_values_grid <- seq(from = 0, to = 0.0015, length.out = 101)

	# error_estimates[j] holds the k-fold estimate of RMSLE for cp_values_grid[j]
	error_estimates <- rep(0, times = length(cp_values_grid))

	# seq_along()/seq_len() are used instead of 1:length(x) / 1:k so that the loops
	# run zero times (rather than over c(1, 0)) if the grid or k is ever empty/zero
	for (j in seq_along(cp_values_grid)) {
	  current_cp_value <- cp_values_grid[j]

	  # Reset the per-fold errors so no value carries over between cp values
	  error_estimate_per_fold <- rep(0, times = k)

	  for (i in seq_len(k)) {
	    # Hold out fold i for validation and train on the remaining folds
	    train_cv <- train %>%
	      filter(fold != i)
	    test_cv <- train %>%
	      filter(fold == i)

	    # Fit model using the current cp value
	    trained_model <- rpart(
	      SalePrice ~ GrLivArea + HalfBath + YearBuilt,
	      data = train_cv,
	      control = rpart.control(cp = current_cp_value)
	    )

	    # Get predictions for the held-out fold
	    y_hat <- predict(trained_model, type = "vector", newdata = test_cv)

	    # Get error on the held-out fold
	    error_estimate_per_fold[i] <- rmsle(
	      actual = test_cv$SalePrice,
	      predicted = y_hat
	    )
	  }

	  # Average the k held-out errors into one estimate for this cp value
	  error_estimates[j] <- mean(error_estimate_per_fold)
	}

	# Pair each candidate cp value with its cross-validated error estimate
	blah <- tibble(cp_value = cp_values_grid, error_estimate = error_estimates)

	# Scatterplot of the estimated error against the complexity parameter
	ggplot(data = blah, mapping = aes(x = cp_value, y = error_estimate)) +
	  geom_point() +
	  xlab("Complexity parameter") +
	  ylab("Estimate of RMSLE")



	# Bonus: Use optimal complexity parameter value to make submissions on Kaggle
	# Several cp values can tie for the lowest estimated RMSLE. Break ties with the
	# LARGEST cp value: rpart does not attempt any split that fails to improve the
	# fit by at least a factor of cp, so a larger cp yields the less complex tree.
	cp_star <- blah %>%
	  arrange(error_estimate, desc(cp_value)) %>%
	  slice(1) %>%
	  pull(cp_value)

	# Refit the tree on the full training set using the selected cp value
	trained_model_all <- rpart(
	  formula = SalePrice ~ GrLivArea + HalfBath + YearBuilt,
	  data = train,
	  control = rpart.control(cp = cp_star)
	)

	# Draw the fitted tree, add its text labels, then frame and title the figure
	plot(trained_model_all, margin = 0.25)
	text(trained_model_all, use.n = TRUE)
	box()
	title("Classification & Regression Tree")

	# Attach the final model's predictions to the test set as a new column
	test <- mutate(
	  test,
	  SalePriceHat = predict(trained_model_all, newdata = test, type = "vector")
	)

	# Save the predictions as a csv in the exact layout Kaggle expects, see
	# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/submit
	# The file keeps only Id plus the prediction renamed to SalePrice.
	write_csv(select(test, Id, SalePrice = SalePriceHat), "submission.csv")

	# This yields an RMSLE of 0.22065!
	# (Folds are assigned at random, so the chosen cp and the score may vary.)
No results found