Skip to content

Instantly share code, notes, and snippets.

@yabyzq
Created November 22, 2016 13:43
Show Gist options
  • Save yabyzq/60c5e3c0e26da4f02ef25477ae0ac59d to your computer and use it in GitHub Desktop.
Save yabyzq/60c5e3c0e26da4f02ef25477ae0ac59d to your computer and use it in GitHub Desktop.
Xgboost
library(xgboost)
library(Matrix)
library(data.table)
library(vcd)
# Data preparation ----
# Load the Arthritis data set and convert it to a data.table so that columns
# can be added and removed by reference with `:=`.
data(Arthritis)
df <- data.table(Arthritis, keep.rownames = FALSE)

# Derived feature: age divided by 10 and rounded, stored as a factor.
head(df[, AgeDiscret := as.factor(round(Age / 10, 0))])

# Derived feature: two-level age category split at 30.
head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))])

# Drop the ID column so that it is not used as a feature.
df[, ID := NULL]

# Encode every column except the target `Improved` as a sparse design matrix;
# the `- 1` term removes the intercept (the leading column of all 1s).
sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)
head(sparse_matrix)

# Binary label: TRUE where the improvement is "Marked", FALSE otherwise.
output_vector <- df[, Improved] == "Marked"
# Model training ----
# Fit a boosted-tree binary classifier on the sparse design matrix:
# 10 boosting rounds, trees of depth at most 4, learning rate 1, 2 threads.
# The arguments are spelled out in full (`nrounds`, `max_depth`) instead of
# relying on partial argument matching or legacy dotted aliases.
bst <- xgboost(
  data = sparse_matrix,
  label = output_vector,
  max_depth = 4,
  eta = 1,
  nthread = 2,
  nrounds = 10,
  objective = "binary:logistic"
)
# Feature importance ----
# Gain: improvement in accuracy brought by a feature; Cover: coverage;
# Frequency: in how many trees the feature appears.
importance <- xgb.importance(
  feature_names = colnames(sparse_matrix),
  model = bst
)
head(importance)

# Raw importance table.
# NOTE(review): the `data` and `label` arguments previously passed here are
# deprecated in xgb.importance, so they are no longer supplied.
importanceRaw <- xgb.importance(
  feature_names = colnames(sparse_matrix),
  model = bst
)

# Work on a copy: `:=` deletes columns by reference, so without copy() the
# "raw" table would silently lose Cover and Frequency as well.
importanceClean <- copy(importanceRaw)[, c("Cover", "Frequency") := NULL]
head(importanceClean, 20)

# Plot
xgb.plot.importance(importance_matrix = importanceRaw)
# Chi-square tests ----
# Test of independence between the raw Age values and the binary label.
c2 <- chisq.test(x = df$Age, y = output_vector)
print(c2)

# Same test using the discretised age factor instead of the raw values.
c2 <- chisq.test(x = df$AgeDiscret, y = output_vector)
print(c2)
# Avoid overfitting:
#   - control model complexity: max_depth, min_child_weight and gamma
#   - increase randomness: subsample, colsample_bytree
#   - reduce eta and increase the number of rounds
# Unbalanced classes:
#   - use scale_pos_weight and evaluate with AUC
#   - or set parameter max_delta_step to 1
# Typical parameter ranges:
#   eta: 0.01 to 0.3
#   min_child_weight: 1 to 5 (too high a value can cause underfitting)
#   max_depth: 3 to 10
#   gamma: 0 to 1 (helps prevent overfitting)
#   max_delta_step: 0 to 1 (used for unbalanced classes)
#   subsample / colsample_bytree: 0.5 to 1
#   scale_pos_weight: helps convergence
# Tuning sequence:
#   1. Tune eta and the number of rounds
#   2. Tune max_depth and min_child_weight
#   3. Tune gamma
#   4. Tune subsample and colsample_bytree
#   5. Tune regularisation
#   6. Reduce eta and increase the number of rounds
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment