# Random Forests vs. Decision Trees
# Gist ivopbernardo/a1e3a04676c6d91dcd7f46d6661bbe54, created February 4, 2022 18:18.
# Note: GitHub flagged this file as containing hidden or bidirectional Unicode
# text; open it in an editor that reveals hidden Unicode characters to review.
# Don't forget to download the train.csv file
# to make this gist work.
# Download it at: https://www.kaggle.com/c/titanic/data?select=train.csv
# You also need to install the ROCR and rpart packages.
# Read the Titanic training data; train.csv is expected in the working
# directory (relative path).
titanic <- read.csv("./train.csv")

# Number of rows assigned to the training partition (70%, rounded up).
size <- ceiling(0.7 * nrow(titanic))

# Draw the training row indices reproducibly. Every row that is not drawn
# ends up in the test partition via negative indexing.
set.seed(999)
train_index <- sample(seq_len(nrow(titanic)), size = size)

train_df <- titanic[train_index, ]
test_df <- titanic[-train_index, ]
# Loading rpart
library(rpart)

# Draw `n` rows of `df` at random, without replacement. This is a base-R
# stand-in for dplyr::sample_n(), which the script called without ever
# loading dplyr. `n` is capped at nrow(df) so that a training set smaller
# than `n` does not make the draw fail.
subsample_rows <- function(df, n = 600) {
  df[sample.int(nrow(df), size = min(n, nrow(df))), , drop = FALSE]
}

# Each tree is fit on its own random subsample of train_df, drawn right
# after a dedicated seed so every fit is reproducible on its own. The
# control options are passed as a plain list, exactly as in the original
# calls.

# Training decision tree 1
set.seed(9990)
oak_tree <- rpart(
  Survived ~ Fare + Age + Sex + Pclass,
  data = subsample_rows(train_df),
  method = "class",
  control = list(maxdepth = 2, minsplit = 30)
)

# Training decision tree 2
set.seed(9991)
pine_tree <- rpart(
  Survived ~ Fare + Age + Sex + Pclass,
  data = subsample_rows(train_df),
  method = "class",
  control = list(maxdepth = 3, minsplit = 3, minbucket = 4, cp = 0.01)
)

# Training decision tree 3
set.seed(9992)
elm_tree <- rpart(
  Survived ~ Fare + Age + Sex + Pclass,
  data = subsample_rows(train_df),
  method = "class",
  control = list(maxdepth = 2, minsplit = 2, minbucket = 4, cp = 0.01)
)
# Building function for AUC
library(ROCR)

# Compute the AUC of a fitted model on a labelled data set.
#
# Arguments:
#   model: a fitted model whose predict() output has at least two columns;
#          the second column is used as the score (for the classification
#          trees in this script that is the column for Survived = 1).
#   data:  data frame to score; must contain a `Survived` column with the
#          true labels. Defaults to the script-level `test_df`, so existing
#          calls of the form obtainauc(model) keep working.
#
# Returns:
#   The AUC as reported by ROCR (first element of the performance object's
#   y.values slot).
obtainauc <- function(model, data = test_df) {
  scores <- predict(model, data)[, 2]
  pred <- prediction(scores, data$Survived)
  perf <- performance(pred, measure = "auc")
  # ROCR stores the requested measure in the S4 slot `y.values` (a list).
  perf@y.values[[1]]
}
# Building ensemble: the score for each test row is the plain average of the
# three trees' second-column predictions.
ensemble <- (
  predict(oak_tree, test_df)[, 2] +
    predict(pine_tree, test_df)[, 2] +
    predict(elm_tree, test_df)[, 2]
) / 3

# Ensemble Performance
# NOTE: the result is stored under the name `prediction`, the same name as
# ROCR's function. The name is kept so existing code relying on it still
# works; R resolves prediction(...) in call position to the function.
prediction <- prediction(ensemble, test_df$Survived)
perf <- performance(prediction, measure = "auc")
# ROCR stores the requested measure in the S4 slot `y.values` (a list).
performance_ensemble <- perf@y.values[[1]]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment