Last active
November 2, 2022 09:31
-
-
Save ivopbernardo/7ffdc7a32ea98d47357b15985913ce7c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training a Random Forest in R - used in blog post:
# https://towardsdatascience.com/data-science-tutorials-training-a-random-forest-in-r-a883cc1bacd1
library(dplyr)
library(randomForest)
library(ranger)
library(Metrics)

# Load the London bike CSV (path is relative to the current working directory)
london_bike <- read.csv("./london_merged.csv")
#' Split a data frame into a training set and a test set
#'
#' Randomly samples a fraction of the rows for training and returns every
#' remaining row as the test set.
#'
#' @param data A data frame.
#' @param percentage Single number between 0 and 1: the fraction of rows to
#'   place in the training set.
#' @param seed Value passed to `set.seed()` right before sampling so that the
#'   split is repeatable. Defaults to 1234. Use `NULL` to leave the RNG state
#'   untouched and get a different split on each call.
#' @return An unnamed list of two data frames: the training rows first, the
#'   test rows second. Both have the same columns as `data`.
train_test_split <- function(data, percentage, seed = 1234) {
  stopifnot(
    is.data.frame(data),
    is.numeric(percentage),
    length(percentage) == 1L,
    !is.na(percentage),
    percentage >= 0,
    percentage <= 1
  )

  # Tag every row with a helper id so the unsampled rows can be recovered
  # with an anti-join. The helper column gets a name that cannot clash with
  # an existing column; a fixed name such as "id" would overwrite (and later
  # drop) a real column of that name.
  id_col <- make.unique(c(names(data), ".row_id"))[length(names(data)) + 1L]
  data_with_row_id <- data
  data_with_row_id[[id_col]] <- seq_len(nrow(data))

  if (!is.null(seed)) {
    set.seed(seed)
  }

  # sample_frac() is kept (rather than a newer sampling verb) so the rows
  # drawn for a given seed stay the same as before.
  training_data <- dplyr::sample_frac(data_with_row_id, percentage)
  test_data <- dplyr::anti_join(data_with_row_id, training_data, by = id_col)

  # The helper id is internal only; strip it before returning.
  training_data[[id_col]] <- NULL
  test_data[[id_col]] <- NULL

  list(training_data, test_data)
}
# Keeping 80% for the training set.
# The split is computed once and both halves are taken from the same result.
# Calling train_test_split() separately for each half repeats the sampling
# and only yields complementary sets because the function reseeds the RNG
# on every call.
london_split <- train_test_split(london_bike, 0.8)
training_data <- london_split[[1]]
test_data <- london_split[[2]]

# Subsetting only target and features.
# A single column vector keeps the training and test sets in sync.
model_columns <- c(
  "t1", "t2", "hum",
  "wind_speed", "weather_code",
  "is_holiday", "is_weekend",
  "season", "cnt"
)
training_data <- training_data[, model_columns]
test_data <- test_data[, model_columns]
# Fitting Random Forest
# Seed the RNG first so the fit below is repeatable from run to run.
set.seed(1234)
rf <- randomForest(
  formula = cnt ~ .,
  data = training_data,
  ntree = 100
)

# Timing the execution
# This fit is only timed; its result is not stored.
system.time(
  randomForest(
    cnt ~ .,
    data = training_data,
    ntree = 100
  )
)

# Adding a new hyperparameter (nodesize)
rf_2 <- randomForest(
  formula = cnt ~ .,
  data = training_data,
  ntree = 100,
  nodesize = 10
)

# Assessing performance: RMSE of the first forest on the held-out test set
rmse(
  test_data$cnt,
  predict(rf, test_data)
)
# Implementation with Ranger
# Seed the RNG first so the fit below is repeatable from run to run.
set.seed(1234)
rf_ranger <- ranger(
  formula = cnt ~ .,
  data = training_data,
  num.trees = 100
)

# Let's check the execution time.
# The timed fit is deliberately NOT assigned: assigning it to `rf_ranger`
# would replace the model fitted right after set.seed(1234), and the RMSE
# below would then be measured on the throwaway timing fit instead.
system.time(
  ranger(
    formula = cnt ~ .,
    data = training_data,
    num.trees = 100
  )
)

# Adding hyperparameters (min.node.size)
rf_ranger_2 <- ranger(
  formula = cnt ~ .,
  data = training_data,
  num.trees = 100,
  min.node.size = 10
)

# Assessing performance: RMSE of the first ranger model on the held-out test
# set. predict() on a ranger model returns an object whose `predictions`
# element holds the predicted values.
rmse(
  test_data$cnt,
  predict(rf_ranger, test_data)$predictions
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment