yabyzq · November 22, 2016 13:43
diff --git a/Xgboost.R b/Xgboost.R
 library(xgboost)
 library(Matrix)
 library(data.table)
 library(vcd)

 # Feature-importance walkthrough: fit a small xgboost classifier on the
 # Arthritis data set and inspect which features drive the prediction.

 # Load the data set and convert it to a data.table so that columns can be
 # added and removed by reference with `:=`.
 data(Arthritis)
 df <- data.table(Arthritis, keep.rownames = FALSE)

 # Derived feature: age bucketed by rounding Age / 10, stored as a factor.
 head(df[, AgeDiscret := as.factor(round(Age / 10, 0))])

 # Derived feature: two-level split of age at 30.
 head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))])

 # Drop the ID column so it is not used as a feature.
 df[, ID := NULL]

 # Sparse design matrix built from every column except the target variable
 # `Improved`; `-1` removes the first (intercept) column, which is all 1s.
 sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)
 head(sparse_matrix)

 # Binary label: TRUE where Improved is "Marked", FALSE otherwise.
 output_vector <- df[, Improved] == "Marked"

 # Argument names are spelled out in full (`nrounds`, `max_depth`) instead of
 # relying on partial matching / dotted aliases.
 bst <- xgboost(
   data = sparse_matrix, label = output_vector,
   max_depth = 4, eta = 1, nthread = 2, nrounds = 10,
   objective = "binary:logistic"
 )

 importance <- xgb.importance(
   feature_names = colnames(sparse_matrix),
   model = bst
 )
 # Gain: improvement in accuracy brought by a feature.
 # Cover: coverage of the feature.
 # Frequency: how often the feature appears in the trees.
 head(importance)

 # Split
 # NOTE(review): `data` and `label` are reported as deprecated in newer xgboost
 # releases -- confirm against the installed version.
 importanceRaw <- xgb.importance(
   feature_names = colnames(sparse_matrix),
   model = bst,
   data = sparse_matrix,
   label = output_vector
 )
 # `:=` modifies a data.table in place, so work on a copy; otherwise
 # importanceRaw would silently lose its Cover and Frequency columns too.
 importanceClean <- copy(importanceRaw)[, `:=`(Cover = NULL, Frequency = NULL)]
 head(importanceClean, 20)

 # Plot
 xgb.plot.importance(importance_matrix = importanceRaw)

 # Chi-square tests between the label and the raw / discretised age.
 c2 <- chisq.test(df$Age, output_vector)
 print(c2)
 c2 <- chisq.test(df$AgeDiscret, output_vector)
 print(c2)


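 # Avoid overfitting:
 # - control model complexity directly: max_depth, min_child_weight and gamma
 # - increase randomness to make training robust to noise: subsample, colsample_bytree
 # - reduce eta and increase the number of rounds to compensate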

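 # Imbalanced classes:
 # - if only the ranking quality matters: balance the classes with scale_pos_weight and evaluate with AUC
 #
 # - if well-calibrated probabilities matter: set parameter max_delta_step to 1 to help convergence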


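 # Typical parameter ranges:
 # - eta: 0.01 to 0.3
 # - min_child_weight: 1 to 5 (values that are too high can cause underfitting)
 # - max_depth: 3 to 10
 # - gamma: 0 to 1 (larger values help prevent overfitting)
 # - max_delta_step: 0 to 1 (used for imbalanced classes)
 # - subsample / colsample_bytree: 0.5 to 1
 # - scale_pos_weight: helps convergence when the classes are imbalanced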

 # Tuning order:
 # 1. Tune eta and the number of rounds.
 # 2. Tune max_depth and min_child_weight.
 # 3. Tune gamma.
 # 4. Tune subsample and colsample_bytree.
 # 5. Tune the regularisation parameters.
 # 6. Reduce eta and increase the number of rounds.
	library(xgboost)
	library(Matrix)
	library(data.table)
	library(vcd)

	# Feature-importance walkthrough: fit a small xgboost classifier on the
	# Arthritis data set and inspect which features drive the prediction.

	# Load the data set and convert it to a data.table so that columns can be
	# added and removed by reference with `:=`.
	data(Arthritis)
	df <- data.table(Arthritis, keep.rownames = FALSE)

	# Derived feature: age bucketed by rounding Age / 10, stored as a factor.
	head(df[, AgeDiscret := as.factor(round(Age / 10, 0))])

	# Derived feature: two-level split of age at 30.
	head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))])

	# Drop the ID column so it is not used as a feature.
	df[, ID := NULL]

	# Sparse design matrix built from every column except the target variable
	# `Improved`; `-1` removes the first (intercept) column, which is all 1s.
	sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)
	head(sparse_matrix)

	# Binary label: TRUE where Improved is "Marked", FALSE otherwise.
	output_vector <- df[, Improved] == "Marked"

	# Argument names are spelled out in full (`nrounds`, `max_depth`) instead of
	# relying on partial matching / dotted aliases.
	bst <- xgboost(
		data = sparse_matrix, label = output_vector,
		max_depth = 4, eta = 1, nthread = 2, nrounds = 10,
		objective = "binary:logistic"
	)

	importance <- xgb.importance(
		feature_names = colnames(sparse_matrix),
		model = bst
	)
	# Gain: improvement in accuracy brought by a feature.
	# Cover: coverage of the feature.
	# Frequency: how often the feature appears in the trees.
	head(importance)

	# Split
	# NOTE(review): `data` and `label` are reported as deprecated in newer xgboost
	# releases -- confirm against the installed version.
	importanceRaw <- xgb.importance(
		feature_names = colnames(sparse_matrix),
		model = bst,
		data = sparse_matrix,
		label = output_vector
	)
	# `:=` modifies a data.table in place, so work on a copy; otherwise
	# importanceRaw would silently lose its Cover and Frequency columns too.
	importanceClean <- copy(importanceRaw)[, `:=`(Cover = NULL, Frequency = NULL)]
	head(importanceClean, 20)

	# Plot
	xgb.plot.importance(importance_matrix = importanceRaw)

	# Chi-square tests between the label and the raw / discretised age.
	c2 <- chisq.test(df$Age, output_vector)
	print(c2)
	c2 <- chisq.test(df$AgeDiscret, output_vector)
	print(c2)


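	# Avoid overfitting:
	# - control model complexity directly: max_depth, min_child_weight and gamma
	# - increase randomness to make training robust to noise: subsample, colsample_bytree
	# - reduce eta and increase the number of rounds to compensate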

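	# Imbalanced classes:
	# - if only the ranking quality matters: balance the classes with scale_pos_weight and evaluate with AUC
	#
	# - if well-calibrated probabilities matter: set parameter max_delta_step to 1 to help convergence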


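	# Typical parameter ranges:
	# - eta: 0.01 to 0.3
	# - min_child_weight: 1 to 5 (values that are too high can cause underfitting)
	# - max_depth: 3 to 10
	# - gamma: 0 to 1 (larger values help prevent overfitting)
	# - max_delta_step: 0 to 1 (used for imbalanced classes)
	# - subsample / colsample_bytree: 0.5 to 1
	# - scale_pos_weight: helps convergence when the classes are imbalanced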

	# Tuning order:
	# 1. Tune eta and the number of rounds.
	# 2. Tune max_depth and min_child_weight.
	# 3. Tune gamma.
	# 4. Tune subsample and colsample_bytree.
	# 5. Tune the regularisation parameters.
	# 6. Reduce eta and increase the number of rounds.