Created
November 22, 2016 13:43
-
-
Save yabyzq/60c5e3c0e26da4f02ef25477ae0ac59d to your computer and use it in GitHub Desktop.
Xgboost
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages used by this walkthrough.
library(xgboost)
library(Matrix)
library(data.table)
library(vcd)

# Load the Arthritis data set and convert it to a data.table.
data(Arthritis)
df <- data.table(Arthritis, keep.rownames = FALSE)

# Derived feature: age divided by ten and rounded, stored as a factor.
head(df[, AgeDiscret := as.factor(round(Age / 10, 0))])

# Derived feature: two-level age split at 30 ("Old" above 30, else "Young").
head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))])

# Remove the ID column by reference so `.` in the formula below skips it.
df[, ID := NULL]

# Build a sparse design matrix from every column except the target variable
# `Improved`; `-1` removes the first (intercept) column of all ones.
sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)
head(sparse_matrix)

# Binary label: TRUE where Improved equals "Marked".
output_vector <- df[, Improved] == "Marked"
# Fit a boosted-tree binary classifier on the sparse design matrix.
# `nrounds` and `max_depth` are spelled out in full rather than relying on
# partial argument matching / dotted aliases.
bst <- xgboost(
  data = sparse_matrix, label = output_vector,
  max_depth = 4, eta = 1, nthread = 2, nrounds = 10,
  objective = "binary:logistic"
)

# Feature importance table; feature names are the design-matrix column names.
# Gain: improvement in accuracy brought by a feature; Cover: coverage;
# Frequency: how many trees the feature appears in.
importance <- xgb.importance(
  feature_names = colnames(sparse_matrix), model = bst
)
head(importance)
# Split
# Importance computed with the training data and labels passed in as well.
# NOTE(review): newer xgboost releases appear to deprecate the `data` and
# `label` arguments of xgb.importance -- confirm against the installed version.
importanceRaw <- xgb.importance(
  feature_names = colnames(sparse_matrix), model = bst,
  data = sparse_matrix, label = output_vector
)

# `:=` modifies a data.table by reference, so drop the columns on a copy;
# otherwise importanceRaw itself would lose Cover and Frequency and the two
# names would refer to the same table.
importanceClean <- copy(importanceRaw)[, `:=`(Cover = NULL, Frequency = NULL)]
head(importanceClean, 20)

# Plot
xgb.plot.importance(importance_matrix = importanceRaw)
# Chi-square
# Test of independence between the raw Age column and the binary label.
c2 <- chisq.test(df$Age, output_vector)
print(c2)

# Same test using the discretised AgeDiscret factor; c2 is overwritten.
c2 <- chisq.test(df$AgeDiscret, output_vector)
print(c2)
# Avoid overfitting
#   - Tune max_depth, min_child_weight and gamma.
#   - Use subsample and colsample_bytree to increase randomness.
#   - Reduce eta and increase the number of rounds.
# Imbalanced classes
#   - Use scale_pos_weight together with AUC as the evaluation metric.
#   - Set the parameter max_delta_step to 1.
# Typical parameter ranges
#   - eta: 0.01 to 0.3
#   - min_child_weight: 1 to 5 (larger values risk underfitting)
#   - max_depth: 3 to 10
#   - gamma: 0 to 1 (helps prevent overfitting)
#   - max_delta_step: 0 to 1 (used for imbalanced classes)
#   - subsample / colsample_bytree: 0.5 to 1
#   - scale_pos_weight: helps convergence
# Tuning sequence
#   1. Tune eta and the number of rounds.
#   2. Tune max_depth and min_child_weight.
#   3. Tune gamma.
#   4. Tune subsample and colsample_bytree.
#   5. Tune the regularisation parameters.
#   6. Reduce eta and increase the number of rounds.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment