# VARIABLE DESCRIPTIONS:
# survival    Survival (0 = No; 1 = Yes)
# pclass      Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
# name        Name
# sex         Sex
# age         Age
# sibsp       Number of Siblings/Spouses Aboard
# parch       Number of Parents/Children Aboard
# ticket      Ticket Number
# fare        Passenger Fare
# cabin       Cabin
# embarked    Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
install.packages('rattle')
install.packages('RGtk2')
install.packages('rpart.plot')
install.packages('RColorBrewer')
install.packages('randomForest')
install.packages('party')
# Set working directory and import datafiles
setwd("~/git/hackaton-data-science")
library(readr)
train <- read_csv("train.csv")
View(train)
# Sum up number of those who survived and those who didn't
table(train$Survived)
# Evaluate % of survivors per sex and age (child / adult)
train$Child <- 0
train$Child[train$Age < 18] <- 1
aggregate(Survived ~ Child + Sex, data=train, FUN=function(x) {sum(x)/length(x)})
# ^ a naive rule already shows up here: every adult male dies, every female survives
# Instead of looking up patterns manually (e.g. how gender and age relate to the death
# rate), let's use decision trees. rpart is greedy: it scans all of the variables for
# the best one to split on, and it judges "best" by whichever split produces the purest
# nodes below.
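# Aside (illustrative only): rpart's default purity measure for classification is the
# Gini index. A hand-rolled version, just for intuition -- not rpart's actual code:
gini <- function(y) { p <- table(y) / length(y); 1 - sum(p^2) }
w <- table(train$Sex) / nrow(train)  # weights: share of passengers in each child node
gini(train$Survived) -
  (w["male"]   * gini(train$Survived[train$Sex == "male"]) +
   w["female"] * gini(train$Survived[train$Sex == "female"]))
# ^ the impurity drop from splitting on Sex; the tree greedily takes the biggest drop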
library(rpart)        # ships with R; this is what actually fits the tree
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
             data=train,
             method="class")
fancyRpartPlot(fit)
train$Prediction <- predict(fit, train, type = "class")
mean(train$Prediction == train$Survived) # [1] 0.8395062
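# Optional check: accuracy alone hides where the tree errs; a quick confusion
# matrix breaks the mistakes down per class:
table(predicted = train$Prediction, actual = train$Survived)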
# ----------
# Let's do some feature engineering and take into account data that we didn't look at
# before: title (Mr, Ms, Mrs etc.), family size and family ID (maybe one family was
# more likely to survive than another)
# Let's stack the test data underneath so the engineered features come out consistent
# across both sets
test <- read_csv("test.csv")
# test has no labels; pad the columns train gained so rbind can match them up
test$Survived <- NA
test$Prediction <- NA
test$Child <- 0
test$Child[test$Age < 18] <- 1
View(test)
combi <- rbind(train, test)
# Extract titles...
combi$Name <- as.character(combi$Name)
# the title sits between the comma and the first period: "Braund, Mr. Owen Harris"
combi$Title <- sapply(combi$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][2]})
combi$Title <- sub(' ', '', combi$Title)  # drop the leading space
# collapse rare or equivalent titles into a handful of buckets
combi$Title[combi$Title %in% c('Mme', 'Mlle')] <- 'Mlle'
combi$Title[combi$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
combi$Title[combi$Title %in% c('Dona', 'Lady', 'the Countess', 'Jonkheer')] <- 'Lady'
combi$Title <- factor(combi$Title)
table(combi$Title) # YES, YES, YES!
# Extract family size...
combi$FamilySize <- combi$SibSp + combi$Parch + 1
# Extract family ID (we'll use family size + surname composite key)
combi$Surname <- sapply(combi$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][1]})
combi$FamilyID <- paste(as.character(combi$FamilySize), combi$Surname, sep="")
combi$FamilyID[combi$FamilySize <= 2] <- 'Small'
# some IDs still end up with <= 2 members (inconsistent surnames); fold those into 'Small' too
famIDs <- data.frame(table(combi$FamilyID))
famIDs <- famIDs[famIDs$Freq <= 2,]
combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- 'Small'
combi$FamilyID <- factor(combi$FamilyID)
# split back into the original train/test rows
train <- combi[1:891,]
test <- combi[892:1309,]
fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title + FamilySize + FamilyID,
             data=train,
             method="class")
train$Prediction <- predict(fit, train, type = "class")
mean(train$Prediction == train$Survived) # [1] 0.8552189
# NICE, WE ARE GETTING BETTER AND BETTER!
Prediction <- predict(fit, test, type = "class")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "feature_engineering.csv", row.names = FALSE) # 0.79426
# Training accuracy looks nice, but a single tree can still overfit. Let's use a random
# forest now.
# Take a large collection of individually imperfect models; their one-off mistakes are
# probably not going to be made by the rest of them. If we average the results of all
# these models, the combination can sometimes beat any of its individual parts.
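# Aside: a hand-rolled flavour of that averaging idea (purely illustrative -- a real
# random forest also samples candidate variables at every split, which this doesn't):
set.seed(42)
votes <- replicate(25, {
  boot <- train[sample(nrow(train), replace = TRUE), ]  # bootstrap resample of the rows
  tree <- rpart(Survived ~ Pclass + Sex + Age + Fare, data = boot, method = "class")
  as.numeric(as.character(predict(tree, train, type = "class")))
})
bagged <- as.numeric(rowMeans(votes) > 0.5)  # majority vote across the 25 trees
mean(bagged == train$Survived)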
# rpart has a great advantage in that it can use surrogate variables when it encounters
# an NA value. Unfortunately, randomForest can't do that, and our dataset is missing a
# lot of Age values, so we need to fill them in first.
# And let's use a decision tree to do it!
Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
                data=combi[!is.na(combi$Age),],
                method="anova")
combi$Age[is.na(combi$Age)] <- predict(Agefit, combi[is.na(combi$Age),])
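# optional sanity check: Min/Max should look plausible and no NAs should remain
summary(combi$Age)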
# Also, Embarked and Fare are missing
# just two Embarked values are missing so let's just hardcode them...
combi$Embarked[c(62,830)] <- "S"
combi$Embarked <- factor(combi$Embarked)
# only one Fare is missing, let's replace it with the median fare
combi$Fare[1044] <- median(combi$Fare, na.rm=TRUE)
# Another problem: the randomForest package can only digest factors with up to 32 levels,
# and our FamilyID variable has almost double that. Let's manually reduce the number of
# levels to keep it under the threshold.
combi$FamilyID2 <- combi$FamilyID
combi$FamilyID2 <- as.character(combi$FamilyID2)
combi$FamilyID2[combi$FamilySize <= 3] <- 'Small'
combi$FamilyID2 <- factor(combi$FamilyID2)
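# optional check: confirm we are now comfortably under the 32-level limit
length(levels(combi$FamilyID2))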
# re-split so train and test pick up the imputed Age/Embarked/Fare and the new FamilyID2
# (without this, the formula below would not find FamilyID2 in train)
train <- combi[1:891,]
test <- combi[892:1309,]
# Let's build random forest now!!!
library(randomForest)
set.seed(415)
fit <- randomForest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
                      Embarked + Title + FamilySize + FamilyID2,
                    data=train,
                    importance=TRUE,
                    ntree=2000)
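# Since we fitted with importance=TRUE, it's worth a peek (optional) at which variables
# the forest leans on; this plots MeanDecreaseAccuracy and MeanDecreaseGini:
varImpPlot(fit)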
train$Prediction <- predict(fit, train, type = "response")  # "response" = predicted class
mean(train$Prediction == train$Survived) # [1] 0.8552189
# :( NO IMPROVEMENT HERE...
Prediction <- predict(fit, test, type = "response")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "random_forest.csv", row.names = FALSE)
# Let's try a forest of conditional inference trees. They make their decisions in a
# slightly different way, using a statistical test rather than a purity measure, but
# the basic construction of each tree is fairly similar.
library(party)
sapply(train, class)  # check column classes: cforest wants factors rather than characters
set.seed(415)
train$Sex <- factor(train$Sex)
train$Embarked <- factor(train$Embarked)
# the test set needs the same factor levels, or predicting on it will choke
test$Sex <- factor(test$Sex, levels = levels(train$Sex))
test$Embarked <- factor(test$Embarked, levels = levels(train$Embarked))
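# Aside (illustrative): grow a single conditional inference tree on a few variables to
# see the "statistical test" idea in action -- each split node reports the association
# test's p-value. (single_ct is just a throwaway name for this sketch.)
single_ct <- ctree(as.factor(Survived) ~ Pclass + Sex + Age + Fare, data = train)
plot(single_ct)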
# note: cforest has no 32-level factor limit, so we can use the full FamilyID here
fit <- cforest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
                 Embarked + Title + FamilySize + FamilyID,
               data = train,
               controls = cforest_unbiased(ntree=2000, mtry=3))
train$Prediction <- predict(fit, train, type = "response")  # party uses "response", not "class"
mean(train$Prediction == train$Survived) # [1] 0.8552189
Prediction <- predict(fit, test, OOB=TRUE, type = "response")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "conditional_inference_trees.csv", row.names = FALSE)
# RESULTS on test data (kaggle)
# feature_engineering           0.79426
# decision_trees                0.79426  // no improvement here :(
# conditional_inference_trees   0.80861