Created
March 25, 2016 18:26
-
-
Save primaryobjects/b9b8a1afa94294023812 to your computer and use it in GitHub Desktop.
Example using xgboost to model Santander data.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # https://www.kaggle.com/fsimond/santander-customer-satisfaction/santander-starter/run/172126/files | |
| library(xgboost) | |
| library(Matrix) | |
# Seed R's random number generator before any further work is done.
set.seed(1234)

# Read the two input files from the current working directory.
test <- read.csv("test.csv")
train <- read.csv("train.csv")

##### Extracting IDs and TARGET
# Keep the test IDs (written to the submission file later) and the training
# labels (passed to xgboost later) before the columns are dropped.
test.id <- test$ID
train.y <- train$TARGET

##### Removing IDs and TARGET so that only predictor columns remain
test$ID <- NULL
train$ID <- NULL
train$TARGET <- NULL
##### 0 count per line

# Count the elements of `x` that are equal to zero.
#
# x: a vector that can be compared with 0.
# Returns the number of elements for which `x == 0` is TRUE (NA if `x`
# contains NA, because sum() propagates missing values).
#
# Kept so that any other code calling count0() keeps working; the row counts
# below no longer go through it.
count0 <- function(x) {
  sum(x == 0)
}

# Add `n0`, the number of zero-valued cells in each row, to both data sets.
# `df == 0` compares every column with 0 and rowSums() adds the resulting
# logical matrix up in compiled code. This replaces `apply(df, 1, count0)`,
# which first coerces the whole data frame with as.matrix() and then calls an
# R closure once per row.
# unname() stores a plain vector without the row-name attribute that
# rowSums() attaches to its result.
train$n0 <- unname(rowSums(train == 0))
test$n0 <- unname(rowSums(test == 0))
##### Removing constant features
cat("\n## Removing the constants features.\n")

# Flag every training column that holds exactly one distinct value.
is_constant <- vapply(
  train,
  function(column) length(unique(column)) == 1,
  logical(1)
)
constant_features <- names(train)[is_constant]

# Report the flagged columns in column order.
for (feature in constant_features) {
  cat(feature, "is constant in train. We delete it.\n")
}

# Drop the flagged columns from both data sets. The decision is based on the
# training data only; the test set simply follows it.
train <- train[, !(names(train) %in% constant_features), drop = FALSE]
test <- test[, !(names(test) %in% constant_features), drop = FALSE]
##### Removing identical features

# Compare every pair of training columns (earlier, later) in column order.
# When all values of the pair compare equal, the later column is scheduled
# for removal. Columns that are already scheduled are skipped, so each group
# of identical columns keeps only its first member.
#
# Differences from a plain `if (all(a == b))` over combn() pairs:
#   * isTRUE() treats a comparison that evaluates to NA (missing values in
#     either column) as "not identical" instead of making `if` fail.
#   * The index loops visit pairs in the same order as combn(names, 2) but do
#     not build the full list of pairs, and they also cope with a data frame
#     that has fewer than two columns.
col_names <- names(train)
n_cols <- length(col_names)
toRemove <- character(0)

for (i in seq_len(max(n_cols - 1L, 0L))) {
  f1 <- col_names[i]
  # Only columns after position i are ever scheduled while i is being
  # processed, so this check is needed once per outer iteration.
  if (f1 %in% toRemove) {
    next
  }
  for (j in seq.int(i + 1L, n_cols)) {
    f2 <- col_names[j]
    if (f2 %in% toRemove) {
      next
    }
    if (isTRUE(all(train[[f1]] == train[[f2]]))) {
      cat(f1, "and", f2, "are equals.\n")
      toRemove <- c(toRemove, f2)
    }
  }
}

# Keep the surviving columns, in their original order, in both data sets.
# drop = FALSE keeps a data frame even when a single feature is left.
feature.names <- setdiff(names(train), toRemove)
train <- train[, feature.names, drop = FALSE]
test <- test[, feature.names, drop = FALSE]
# Booster settings passed to xgb.train(): a tree booster with a logistic
# objective for the binary TARGET, evaluated with AUC.
param <- list(
  objective = "binary:logistic",
  booster = "gbtree",
  eval_metric = "auc",
  eta = 0.02,
  max_depth = 8,
  subsample = 0.9,
  colsample_bytree = 0.85
)

# Re-attach the labels so the formula `TARGET ~ .` can be evaluated, then
# replace the data frame with its sparse design matrix.
train$TARGET <- train.y
train <- sparse.model.matrix(TARGET ~ ., data = train)

# Wrap the design matrix and the labels for xgboost. The watchlist contains
# the training set only, so no held-out data is evaluated during training.
dtrain <- xgb.DMatrix(data = train, label = train.y)
watchlist <- list(train = dtrain)

# Fit the model for 350 boosting rounds.
clf <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = 350,
  verbose = 1,
  watchlist = watchlist,
  maximize = TRUE
)
# The formula `TARGET ~ .` needs a TARGET column to exist, so give the test
# set a placeholder value; it is only the response side of the formula and
# does not become a column of the design matrix.
test$TARGET <- -1
test <- sparse.model.matrix(TARGET ~ ., data = test)

# Score the test rows with the trained booster.
preds <- predict(clf, test)

# Pair each prediction with the ID saved before the ID column was dropped
# and write the result to "submission.csv" in the working directory.
submission <- data.frame(ID = test.id, TARGET = preds)
cat("saving the submission file\n")
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
write.csv(submission, "submission.csv", row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment