Created
February 3, 2016 13:11
-
-
Save YuriyGuts/ebc6561b8530b0e59d8d to your computer and use it in GitHub Desktop.
Predict the survival of RMS Titanic passengers using logistic regression.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Predict the survival of RMS Titanic passengers using logistic regression.
# Based on Kaggle Titanic dataset: https://www.kaggle.com/c/titanic/data
#
# You might need to install Amelia and ROCR packages.
# Prepare a raw Titanic passenger data frame for modelling.
#
# Args:
#   rawData: data frame containing at least the columns Age, Embarked, Fare,
#     Parch, SibSp, Sex and Pclass. The Survived column is kept when present,
#     so unlabelled data can be cleaned with the same function.
#
# Returns:
#   A data frame restricted to the modelling columns, in which missing Age
#   values are replaced by the mean of the observed ages and rows with a
#   missing Embarked value are removed.
#
# Raises:
#   An error naming the absent column(s) when a required predictor is missing.
cleanData <- function(rawData) {
  # Uncomment these two lines to visualize the missing data.
  # library(Amelia)
  # missmap(rawData, main = "Missing vs. observed values")

  # Keep only the columns used by the model; everything else is dropped.
  predictorColumns <- c("Age", "Embarked", "Fare", "Parch", "SibSp", "Sex", "Pclass")
  missingColumns <- setdiff(predictorColumns, names(rawData))
  if (length(missingColumns) > 0) {
    stop(
      "Missing required column(s): ", paste(missingColumns, collapse = ", "),
      call. = FALSE
    )
  }

  # The label is optional so the function also works on data to be predicted.
  columnsToKeep <- c(predictorColumns, intersect("Survived", names(rawData)))
  data <- rawData[columnsToKeep]

  # Replace missing age with the mean value. The mean is taken over all rows,
  # including those removed below for lacking an Embarked value.
  data$Age[is.na(data$Age)] <- mean(data$Age, na.rm = TRUE)

  # Remove passengers with unknown Embarked location.
  data[!is.na(data$Embarked), , drop = FALSE]
}
# Load ROCR up front so a missing package fails before any model is fitted.
library(ROCR)

# Read the raw CSV file and clean up invalid data.
# Empty strings are read as NA so that cleanData can detect missing values.
rawData <- read.csv("train.csv", header = TRUE, na.strings = c(""))
cleanedData <- cleanData(rawData)

# Split the dataset into train and test subsets (80%:20%).
# The split is positional: the first 80% of the rows are used for training.
splitPoint <- as.integer(nrow(cleanedData) * 0.8)
trainingData <- cleanedData[seq_len(splitPoint), ]
testData <- cleanedData[(splitPoint + 1):nrow(cleanedData), ]

# Train the logistic regression classifier.
model <- glm(Survived ~ ., family = binomial(link = "logit"), data = trainingData)

# Output the summary analysis of the model.
# Explicit print() keeps the output visible when the file is run via source().
print(summary(model))
print(anova(model, test = "Chisq"))

# Predict test data.
rawTestPrediction <- predict(model, newdata = testData, type = "response")

# Convert the sigmoid values to hard 0 or 1 values.
testPrediction <- ifelse(rawTestPrediction > 0.5, 1, 0)

# Compute the classification error.
classificationError <- mean(as.vector(testPrediction) != testData$Survived)
print(sprintf('Accuracy: %.3f%%', (1 - classificationError) * 100))

# Plot the ROC curve.
modelROCCurve <- performance(
  prediction(rawTestPrediction, testData$Survived),
  measure = "tpr",
  x.measure = "fpr"
)
plot(modelROCCurve)

# Predict unknown data.
unknownData <- read.csv("predict.csv", header = TRUE, na.strings = c(""))

# Fill missing numeric predictors with the training-set means before
# predicting; otherwise predict() returns NA for those passengers.
# NOTE(review): rows with a missing Embarked value will presumably still get
# an NA prediction -- confirm whether predict.csv can contain such rows.
unknownData$Age[is.na(unknownData$Age)] <- mean(trainingData$Age, na.rm = TRUE)
unknownData$Fare[is.na(unknownData$Fare)] <- mean(trainingData$Fare, na.rm = TRUE)

unknownPrediction <- predict(model, newdata = unknownData, type = "response")
unknownPrediction <- ifelse(unknownPrediction > 0.5, 1, 0)

# Append predicted value to original dataset.
unknownData$Survived <- unknownPrediction
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment