Skip to content

Instantly share code, notes, and snippets.

@primaryobjects
Created March 25, 2016 18:26
Show Gist options
  • Select an option

  • Save primaryobjects/b9b8a1afa94294023812 to your computer and use it in GitHub Desktop.

Select an option

Save primaryobjects/b9b8a1afa94294023812 to your computer and use it in GitHub Desktop.
Example using xgboost to model Santander data.
# https://www.kaggle.com/fsimond/santander-customer-satisfaction/santander-starter/run/172126/files
library(xgboost)
library(Matrix)
# Fix the RNG seed before anything else runs.
set.seed(1234)

# Load the raw training and test tables from the working directory.
train <- read.csv("train.csv")
test <- read.csv("test.csv")

##### Removing IDs
# Keep the test IDs aside (they are needed for the submission file),
# then drop the ID column from both tables.
test.id <- test$ID
train[["ID"]] <- NULL
test[["ID"]] <- NULL

##### Extracting TARGET
# Pull the label out of the training table and remove it from the features.
train.y <- train$TARGET
train[["TARGET"]] <- NULL
##### 0 count per line
# Count how many entries of `x` compare equal to zero.
# No NA handling is applied: if the comparison produces an NA, the sum is NA.
count0 <- function(x) {
  zero_mask <- x == 0
  sum(zero_mask)
}
# Engineered feature `n0`: number of zero-valued entries in each row.
# Comparing a data frame with 0 works column by column and yields a logical
# matrix, so rowSums() counts the zeros in one vectorised pass instead of
# coercing the whole frame with apply() and looping over rows in R.
# as.integer() keeps the plain, unnamed integer counts that summing logicals
# row by row would give (an NA in a row still gives NA for that row).
train$n0 <- as.integer(rowSums(train == 0))
test$n0 <- as.integer(rowSums(test == 0))
##### Removing constant features
cat("\n## Removing the constants features.\n")
# Flag every training column that holds a single distinct value.
is_constant <- vapply(
  train,
  function(column) length(unique(column)) == 1,
  logical(1)
)
# Report each constant column and drop it from both tables, in column order.
for (f in names(train)[is_constant]) {
  cat(f, "is constant in train. We delete it.\n")
  train[[f]] <- NULL
  test[[f]] <- NULL
}
##### Removing identical features
# Two columns count as duplicates when their NA positions coincide and every
# non-missing value matches. Written this way the result is always TRUE or
# FALSE, so a missing value can never make the if() condition below NA.
same_column <- function(a, b) {
  all(is.na(a) == is.na(b)) && all(a == b, na.rm = TRUE)
}

# Every unordered pair of remaining feature names; combn() errors when asked
# for pairs out of fewer than two items, so that case yields no pairs instead.
features_pair <- if (length(names(train)) >= 2) {
  combn(names(train), 2, simplify = FALSE)
} else {
  list()
}

toRemove <- c()
for (pair in features_pair) {
  f1 <- pair[1]
  f2 <- pair[2]
  # Skip pairs that involve a column already marked for removal; the scalar
  # && is the right operator for an if() condition.
  if (!(f1 %in% toRemove) && !(f2 %in% toRemove)) {
    if (same_column(train[[f1]], train[[f2]])) {
      cat(f1, "and", f2, "are equals.\n")
      # Keep the first column of the pair, drop the second.
      toRemove <- c(toRemove, f2)
    }
  }
}

feature.names <- setdiff(names(train), toRemove)
# drop = FALSE keeps both objects data frames even if one feature survives.
train <- train[, feature.names, drop = FALSE]
test <- test[, feature.names, drop = FALSE]
# Put the label back so the formula below can reference TARGET, then build
# the sparse design matrix and wrap it, with the labels, in an xgb.DMatrix.
train[["TARGET"]] <- train.y
train <- sparse.model.matrix(TARGET ~ ., data = train)
dtrain <- xgb.DMatrix(data = train, label = train.y)

# Only the training set is monitored during boosting.
watchlist <- list(train = dtrain)

# Booster settings passed to xgb.train().
param <- list(
  objective        = "binary:logistic",
  booster          = "gbtree",
  eval_metric      = "auc",
  eta              = 0.02,
  max_depth        = 8,
  subsample        = 0.9,
  colsample_bytree = 0.85
)

# Fit the model for a fixed 350 rounds.
clf <- xgb.train(
  params    = param,
  data      = dtrain,
  nrounds   = 350,
  verbose   = 1,
  watchlist = watchlist,
  maximize  = TRUE
)
# The formula used for the design matrix needs a TARGET column, so give the
# test table a placeholder value before building its sparse matrix.
test$TARGET <- -1
test <- sparse.model.matrix(TARGET ~ ., data = test)
preds <- predict(clf, test)

# Fail loudly if the number of predictions differs from the number of IDs
# (e.g. if rows were dropped while building the matrix): data.frame() would
# otherwise recycle the shorter vector or stop with a less helpful error.
stopifnot(length(preds) == length(test.id))

submission <- data.frame(ID = test.id, TARGET = preds)
cat("saving the submission file\n")
# Spell out FALSE: the shorthand F is an ordinary variable and can be reassigned.
write.csv(submission, "submission.csv", row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment