Skip to content

Instantly share code, notes, and snippets.

@YuriyGuts
Created February 3, 2016 13:11
Show Gist options
  • Save YuriyGuts/ebc6561b8530b0e59d8d to your computer and use it in GitHub Desktop.
Save YuriyGuts/ebc6561b8530b0e59d8d to your computer and use it in GitHub Desktop.
Predict the survival of RMS Titanic passengers using logistic regression.
# Predict the survival of RMS Titanic passengers using logistic regression.
# Based on Kaggle Titanic dataset: https://www.kaggle.com/c/titanic/data
#
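# Requires the ROCR package (for the ROC curve). The Amelia package is only
# needed if you enable the optional missing-data visualization in cleanData().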
# Prepare a raw Titanic passenger table for modelling.
#
# rawData: data frame containing at least the columns Age, Embarked, Fare,
#          Parch, SibSp, Sex, Pclass and Survived.
# Returns: a data frame holding only those eight columns, with missing Age
#          values filled in and rows with a missing Embarked value removed.
cleanData <- function(rawData) {
  # To visualize the missing data, run (requires the Amelia package):
  #   Amelia::missmap(rawData, main="Missing vs. observed values")

  # Keep only the response and the columns used as predictors.
  predictiveColumns <- c(
    "Age", "Embarked", "Fare", "Parch", "SibSp", "Sex", "Pclass", "Survived"
  )
  selected <- rawData[predictiveColumns]

  # Fill unknown ages with the mean of the known ages. This happens before any
  # rows are dropped, so the mean is taken over every row of the input.
  ageIsMissing <- is.na(selected$Age)
  selected$Age[ageIsMissing] <- mean(selected$Age[!ageIsMissing])

  # Drop passengers whose Embarked location is unknown.
  selected[!is.na(selected$Embarked), ]
}
# Read the raw CSV file and clean up invalid data.
# Empty fields are read as NA so that cleanData() can detect them.
rawData <- read.csv("train.csv", header = TRUE, na.strings = "")
cleanedData <- cleanData(rawData)
# Split the dataset into train and test subsets (80%:20%), preserving file
# order: the first 80% of the rows train the model, the rest evaluate it.
splitPoint <- as.integer(nrow(cleanedData) * 0.8)
# Select rows with a logical mask over seq_len() rather than 1:splitPoint and
# (splitPoint + 1):nrow(...). Those ranges count backwards when splitPoint is 0
# (zero or one cleaned row), which would pick the wrong rows for both subsets.
rowPosition <- seq_len(nrow(cleanedData))
trainingData <- cleanedData[rowPosition <= splitPoint, ]
testData <- cleanedData[rowPosition > splitPoint, ]
# Fit a logistic regression (binomial GLM with a logit link) that predicts
# Survived from every other column of the training subset.
model <- glm(
  formula = Survived ~ .,
  family = binomial(link = "logit"),
  data = trainingData
)
# Report the fitted coefficients, then an analysis-of-deviance table with
# chi-squared tests.
summary(model)
anova(model, test = "Chisq")
# Score the held-out rows; type = "response" asks for predicted probabilities.
rawTestPrediction <- predict(model, newdata = testData, type = "response")
# Threshold the probabilities at 0.5 to obtain hard 0 or 1 labels.
testPrediction <- ifelse(rawTestPrediction > 0.5, yes = 1, no = 0)
# Classification error: the share of held-out rows whose predicted label
# differs from the recorded Survived value.
classificationError <- mean(unname(testPrediction) != testData$Survived)
sprintf("Accuracy: %.3f%%", 100 * (1 - classificationError))
# Plot the ROC curve: true positive rate against false positive rate, computed
# by ROCR from the predicted probabilities and the recorded outcomes.
library(ROCR)
modelROCCurve <- performance(
  prediction.obj = prediction(
    predictions = rawTestPrediction,
    labels = testData$Survived
  ),
  measure = "tpr",
  x.measure = "fpr"
)
plot(modelROCCurve)
# Predict unknown data.
unknownData <- read.csv("predict.csv", header = TRUE, na.strings = "")
# The model was fitted on data whose missing ages had been replaced by the mean
# known age in train.csv, so the same fill value is applied here. Without it,
# every passenger with an unknown age would get an NA prediction. The fill is
# done on a copy so the Age column of the output dataset keeps its own values.
unknownFeatures <- unknownData
unknownFeatures$Age[is.na(unknownFeatures$Age)] <-
  mean(rawData$Age, na.rm = TRUE)
unknownPrediction <- predict(
  model,
  newdata = unknownFeatures,
  type = "response"
)
# Convert probabilities to hard 0 or 1 values. Rows that are still missing
# some other predictor keep an NA prediction.
unknownPrediction <- ifelse(unknownPrediction > 0.5, 1, 0)
# Append predicted value to original dataset.
unknownData$Survived <- unknownPrediction
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment