# VARIABLE DESCRIPTIONS:
# survival    Survival (0 = No; 1 = Yes)
# pclass      Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
# name        Name
# sex         Sex
# age         Age
# sibsp       Number of Siblings/Spouses Aboard
# parch       Number of Parents/Children Aboard
# ticket      Ticket Number
# fare        Passenger Fare
# cabin       Cabin
# embarked    Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
install.packages('rattle')
install.packages('RGtk2')
install.packages('rpart.plot')
install.packages('RColorBrewer')
install.packages('randomForest')
install.packages('party')
# Set working directory and import datafiles
setwd("~/git/hackaton-data-science")
library(readr)
train <- read_csv("train.csv")
View(train)
# Sum up number of those who survived and those who didn't
table(train$Survived)
# Evaluate % of survivors per sex and age (child / adult)
train$Child <- 0
train$Child[train$Age < 18] <- 1
aggregate(Survived ~ Child + Sex, data=train, FUN=function(x) {sum(x)/length(x)})
# ^ a naive rule already shows up here: every adult male dies, every female survives
# Instead of looking up patterns manually (e.g. how gender and age relate to the death
# rate), let's use decision trees. rpart is greedy: it scans all of the variables for
# the best one to split on, and it judges "best" by whichever split produces the purest
# nodes below.
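# Aside (illustrative only): rpart's default purity measure for classification is the
# Gini index. A hand-rolled version, just for intuition -- not rpart's actual code:
gini <- function(y) { p <- table(y) / length(y); 1 - sum(p^2) }
w <- table(train$Sex) / nrow(train)  # weights: share of passengers in each child node
gini(train$Survived) -
  (w["male"]   * gini(train$Survived[train$Sex == "male"]) +
   w["female"] * gini(train$Survived[train$Sex == "female"]))
# ^ the impurity drop from splitting on Sex; the tree greedily takes the biggest drop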
library(rpart)        # ships with R; this is what actually fits the tree
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
             data=train,
             method="class")
fancyRpartPlot(fit)
train$Prediction <- predict(fit, train, type = "class")
mean(train$Prediction == train$Survived) # [1] 0.8395062
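# Optional check: accuracy alone hides where the tree errs; a quick confusion
# matrix breaks the mistakes down per class:
table(predicted = train$Prediction, actual = train$Survived)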
# ----------
# Let's do some feature engineering and take into account data that we didn't look at
# before: title (Mr, Ms, Mrs etc.), family size and family ID (maybe one family was
# more likely to survive than another)
# Let's stack the test data underneath so the engineered features come out consistent
# across both sets
test <- read_csv("test.csv")
# test has no labels; pad the columns train gained so rbind can match them up
test$Survived <- NA
test$Prediction <- NA
test$Child <- 0
test$Child[test$Age < 18] <- 1
View(test)
combi <- rbind(train, test)
# Extract titles...
combi$Name <- as.character(combi$Name)
# the title sits between the comma and the first period: "Braund, Mr. Owen Harris"
combi$Title <- sapply(combi$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][2]})
combi$Title <- sub(' ', '', combi$Title)  # drop the leading space
# collapse rare or equivalent titles into a handful of buckets
combi$Title[combi$Title %in% c('Mme', 'Mlle')] <- 'Mlle'
combi$Title[combi$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
combi$Title[combi$Title %in% c('Dona', 'Lady', 'the Countess', 'Jonkheer')] <- 'Lady'
combi$Title <- factor(combi$Title)
table(combi$Title) # YES, YES, YES!
# Extract family size...
combi$FamilySize <- combi$SibSp + combi$Parch + 1
# Extract family ID (we'll use family size + surname composite key)
combi$Surname <- sapply(combi$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][1]})
combi$FamilyID <- paste(as.character(combi$FamilySize), combi$Surname, sep="")
combi$FamilyID[combi$FamilySize <= 2] <- 'Small'
# some IDs still end up with <= 2 members (inconsistent surnames); fold those into 'Small' too
famIDs <- data.frame(table(combi$FamilyID))
famIDs <- famIDs[famIDs$Freq <= 2,]
combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- 'Small'
combi$FamilyID <- factor(combi$FamilyID)
# split back into the original train/test rows
train <- combi[1:891,]
test <- combi[892:1309,]
fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title + FamilySize + FamilyID,
             data=train,
             method="class")
train$Prediction <- predict(fit, train, type = "class")
mean(train$Prediction == train$Survived) # [1] 0.8552189
# NICE, WE ARE GETTING BETTER AND BETTER!
Prediction <- predict(fit, test, type = "class")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "feature_engineering.csv", row.names = FALSE) # 0.79426
# Training accuracy looks nice, but a single tree can still overfit. Let's use a random
# forest now.
# Take a large collection of individually imperfect models; their one-off mistakes are
# probably not going to be made by the rest of them. If we average the results of all
# these models, the combination can sometimes beat any of its individual parts.
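# Aside: a hand-rolled flavour of that averaging idea (purely illustrative -- a real
# random forest also samples candidate variables at every split, which this doesn't):
set.seed(42)
votes <- replicate(25, {
  boot <- train[sample(nrow(train), replace = TRUE), ]  # bootstrap resample of the rows
  tree <- rpart(Survived ~ Pclass + Sex + Age + Fare, data = boot, method = "class")
  as.numeric(as.character(predict(tree, train, type = "class")))
})
bagged <- as.numeric(rowMeans(votes) > 0.5)  # majority vote across the 25 trees
mean(bagged == train$Survived)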
# rpart has a great advantage in that it can use surrogate variables when it encounters
# an NA value. Unfortunately, randomForest can't do that, and our dataset is missing a
# lot of Age values, so we need to fill them in first.
# And let's use a decision tree to do it!
Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
                data=combi[!is.na(combi$Age),],
                method="anova")
combi$Age[is.na(combi$Age)] <- predict(Agefit, combi[is.na(combi$Age),])
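# optional sanity check: Min/Max should look plausible and no NAs should remain
summary(combi$Age)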
# Also, Embarked and Fare are missing
# just two Embarked values are missing so let's just hardcode them...
combi$Embarked[c(62,830)] <- "S"
combi$Embarked <- factor(combi$Embarked)
# only one Fare is missing, let's replace it with the median fare
combi$Fare[1044] <- median(combi$Fare, na.rm=TRUE)
# Another problem: the randomForest package can only digest factors with up to 32 levels,
# and our FamilyID variable has almost double that. Let's manually reduce the number of
# levels to keep it under the threshold.
combi$FamilyID2 <- combi$FamilyID
combi$FamilyID2 <- as.character(combi$FamilyID2)
combi$FamilyID2[combi$FamilySize <= 3] <- 'Small'
combi$FamilyID2 <- factor(combi$FamilyID2)
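# optional check: confirm we are now comfortably under the 32-level limit
length(levels(combi$FamilyID2))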
# re-split so train and test pick up the imputed Age/Embarked/Fare and the new FamilyID2
# (without this, the formula below would not find FamilyID2 in train)
train <- combi[1:891,]
test <- combi[892:1309,]
# Let's build random forest now!!!
library(randomForest)
set.seed(415)
fit <- randomForest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
                      Embarked + Title + FamilySize + FamilyID2,
                    data=train,
                    importance=TRUE,
                    ntree=2000)
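# Since we fitted with importance=TRUE, it's worth a peek (optional) at which variables
# the forest leans on; this plots MeanDecreaseAccuracy and MeanDecreaseGini:
varImpPlot(fit)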
train$Prediction <- predict(fit, train, type = "response")  # "response" = predicted class
mean(train$Prediction == train$Survived) # [1] 0.8552189
# :( NO IMPROVEMENT HERE...
Prediction <- predict(fit, test, type = "response")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "random_forest.csv", row.names = FALSE)
# Let's try a forest of conditional inference trees. They make their decisions in a
# slightly different way, using a statistical test rather than a purity measure, but
# the basic construction of each tree is fairly similar.
library(party)
sapply(train, class)  # check column classes: cforest wants factors rather than characters
set.seed(415)
train$Sex <- factor(train$Sex)
train$Embarked <- factor(train$Embarked)
# the test set needs the same factor levels, or predicting on it will choke
test$Sex <- factor(test$Sex, levels = levels(train$Sex))
test$Embarked <- factor(test$Embarked, levels = levels(train$Embarked))
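# Aside (illustrative): grow a single conditional inference tree on a few variables to
# see the "statistical test" idea in action -- each split node reports the association
# test's p-value. (single_ct is just a throwaway name for this sketch.)
single_ct <- ctree(as.factor(Survived) ~ Pclass + Sex + Age + Fare, data = train)
plot(single_ct)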
# note: cforest has no 32-level factor limit, so we can use the full FamilyID here
fit <- cforest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
                 Embarked + Title + FamilySize + FamilyID,
               data = train,
               controls = cforest_unbiased(ntree=2000, mtry=3))
train$Prediction <- predict(fit, train, type = "response")  # party uses "response", not "class"
mean(train$Prediction == train$Survived) # [1] 0.8552189
Prediction <- predict(fit, test, OOB=TRUE, type = "response")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "conditional_inference_trees.csv", row.names = FALSE)
# RESULTS on test data (kaggle)
# feature_engineering           0.79426
# decision_trees                0.79426  // no improvement here :(
# conditional_inference_trees   0.80861