Last active
November 2, 2022 09:31
-
-
Save ivopbernardo/fced9da45c6756073f5140ee5b1301e0 to your computer and use it in GitHub Desktop.
xgboostr.r
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training an XGBoost in R - used in blog post:
# https://towardsdatascience.com/data-science-tutorials-training-an-xgboost-using-r-cf3c00b1425

library(dplyr)
library(xgboost)
library(Metrics)
library(ggplot2)

# Load the London bike sharing dataset from the working directory
london_bike <- read.csv("./london_merged.csv")
# Split a data frame into a training set and a test set.
#
# Args:
#   data:       data frame to split.
#   percentage: fraction of rows (between 0 and 1) kept for training.
#
# Returns:
#   A list with the training set first and the test set second. The
#   elements are also named ("train", "test"), so both positional
#   access (result[[1]], result[[2]]) and named access work.
train_test_split <- function(data, percentage) {
  stopifnot(is.data.frame(data), percentage >= 0, percentage <= 1)
  # Fixed seed so repeated calls always produce the same split
  set.seed(1234)
  n_rows <- nrow(data)
  n_train <- round(n_rows * percentage)
  train_idx <- sample(seq_len(n_rows), n_train)
  # setdiff (rather than negative indexing) handles n_train == 0
  # correctly: data[-integer(0), ] would return zero rows, not all rows
  test_idx <- setdiff(seq_len(n_rows), train_idx)
  list(
    train = data[train_idx, , drop = FALSE],
    test = data[test_idx, , drop = FALSE]
  )
}
# Keeping 80% for the training set.
# Split once and reuse both parts: the original called the function
# twice, recomputing the split and relying on the internal set.seed()
# to keep the two calls consistent.
split_parts <- train_test_split(london_bike, 0.8)
training_data <- split_parts[[1]]
test_data <- split_parts[[2]]
# Subsetting only the features.
# The column list is factored into a single constant so the train
# and test subsets cannot drift apart.
feature_cols <- c(
  "t1", "t2", "hum", "wind_speed",
  "weather_code", "is_holiday", "is_weekend", "season"
)
X_train <- training_data[, feature_cols]
X_test <- test_data[, feature_cols]

# Defining target variable (cnt: the bike ride count to predict)
y_train <- training_data$cnt
y_test <- test_data$cnt
# Fitting a baseline XGBoost model with default hyperparameters.
# NOTE: the parameter is `nrounds` (number of boosting iterations);
# the original `nround` only worked via R's partial argument matching.
set.seed(1234)
xgb <- xgboost(data = as.matrix(X_train),
               label = y_train,
               nrounds = 10)

# Accessing hyperparameters (opens the help page when run interactively)
?xgboost
# Timing the execution (re-fits an identical model just to measure it)
system.time(
  xgboost(data = as.matrix(X_train),
          label = y_train,
          nrounds = 10)  # was `nround`; matched `nrounds` only partially
)

# Assessing performance: RMSE of predictions on the held-out test set
rmse(
  y_test,
  predict(xgb, as.matrix(X_test))
)
# Adding a new hyperparameter: deeper trees (max_depth = 20) and more
# boosting rounds. `nrounds` spelled out (was `nround`, partial match).
set.seed(1234)
xgb_ext <- xgboost(data = as.matrix(X_train),
                   label = y_train,
                   nrounds = 50,
                   max_depth = 20)

# Plotting training RMSE throughout the boosting iterations
ggplot(
  data = xgb_ext$evaluation_log,
  aes(x = iter, y = train_rmse)
) +
  geom_line(color = "darkred") +
  geom_point() +
  ylab("RMSE") +
  xlab("Iteration Number") +
  theme_light()
# Assessing execution time of the larger model (re-fits to measure)
system.time(
  xgboost(data = as.matrix(X_train),
          label = y_train,
          nrounds = 50,  # was `nround`; matched `nrounds` only partially
          max_depth = 20)
)

# Assessing performance of the extended model on the test set
rmse(
  y_test,
  predict(xgb_ext, as.matrix(X_test))
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment