YuriyGuts · February 3, 2016 13:11
diff --git a/titanic-survival-lr.r b/titanic-survival-lr.r
 # Predict the survival of RMS Titanic passengers using logistic regression.
 # Based on Kaggle Titanic dataset: https://www.kaggle.com/c/titanic/data
 #
 # You might need to install Amelia and ROCR packages.


 # Prepare a raw passenger data frame for logistic regression.
 #
 # Args:
 #   rawData: data frame that contains the predictor columns Age, Embarked,
 #            Fare, Parch, SibSp, Sex and Pclass. The Survived column is kept
 #            when present and is optional, so unlabelled data can be cleaned
 #            with the same function.
 #
 # Returns:
 #   A data frame restricted to the columns above, with missing Age values
 #   replaced by the mean of the observed ages and rows with a missing
 #   Embarked value removed.
 #
 # Stops with an error if any predictor column is absent from rawData.
 cleanData <- function(rawData) {
    # Uncomment these two lines to visualize the missing data.
    # library(Amelia)
    # missmap(rawData, main="Missing vs. observed values")

    # Keep only the columns used by the model; everything else is dropped.
    predictorColumns <- c("Age", "Embarked", "Fare", "Parch", "SibSp", "Sex", "Pclass")
    missingColumns <- setdiff(predictorColumns, names(rawData))
    if (length(missingColumns) > 0) {
        stop("cleanData: missing required column(s): ",
             paste(missingColumns, collapse=", "), call.=FALSE)
    }

    # Survived is the label: present in training data, absent in data to predict.
    columnsToKeep <- c(predictorColumns, intersect("Survived", names(rawData)))
    data <- rawData[columnsToKeep]

    # Replace missing age with the mean of the observed ages.
    data$Age[is.na(data$Age)] <- mean(data$Age, na.rm=TRUE)

    # Remove passengers with unknown Embarked location.
    data <- data[!is.na(data$Embarked),]

    data
 }

 # Read the raw CSV file and clean up invalid data.
 # Empty strings in the file are treated as missing values.
 rawData <- read.csv("train.csv", header=TRUE, na.strings=c(""))
 cleanedData <- cleanData(rawData)

 # Split the dataset into train and test subsets (80%:20%), in file order.
 # A logical mask is used instead of 1:n ranges because 1:0 counts backwards
 # and would select the wrong rows for an empty or very small dataset.
 splitPoint <- as.integer(nrow(cleanedData) * 0.8)
 isTrainingRow <- seq_len(nrow(cleanedData)) <= splitPoint
 trainingData <- cleanedData[isTrainingRow,]
 testData <- cleanedData[!isTrainingRow,]

 # Train the logistic regression classifier on all remaining columns.
 model <- glm(Survived ~ ., family=binomial(link="logit"), data=trainingData)

 # Output the summary analysis of the model.
 # Printed explicitly so the output also appears when the file is source()d.
 print(summary(model))
 print(anova(model, test="Chisq"))

 # Predict test data (probabilities, because type="response").
 rawTestPrediction <- predict(model, newdata=testData, type="response")

 # Convert the sigmoid values to hard 0 or 1 values.
 testPrediction <- ifelse(rawTestPrediction > 0.5, 1, 0)

 # Compute the classification error and report the accuracy.
 classificationError <- mean(as.vector(testPrediction) != testData$Survived)
 print(sprintf('Accuracy: %.3f%%', (1 - classificationError) * 100))

 # Plot the ROC curve (true positive rate against false positive rate).
 library(ROCR)
 modelROCCurve <- performance(prediction(rawTestPrediction, testData$Survived), measure="tpr", x.measure="fpr")
 plot(modelROCCurve)

 # Predict unknown data.
 unknownData <- read.csv("predict.csv", header=TRUE, na.strings=c(""))

 # The model was fitted on ages imputed with the mean age of train.csv, so the
 # same value is used here; otherwise a missing Age yields an NA prediction.
 # The imputation is done on a copy so unknownData keeps its original values.
 predictionInput <- unknownData
 predictionInput$Age[is.na(predictionInput$Age)] <- mean(rawData$Age, na.rm=TRUE)
 unknownPrediction <- predict(model, newdata=predictionInput, type="response")
 unknownPrediction <- ifelse(unknownPrediction > 0.5, 1, 0)

 # Append predicted value to original dataset.
 unknownData$Survived <- unknownPrediction
	# Predict the survival of RMS Titanic passengers using logistic regression.
	# Based on Kaggle Titanic dataset: https://www.kaggle.com/c/titanic/data
	#
	# You might need to install Amelia and ROCR packages.


	# Prepare a raw passenger data frame for logistic regression.
	#
	# Args:
	#   rawData: data frame that contains the predictor columns Age, Embarked,
	#            Fare, Parch, SibSp, Sex and Pclass. The Survived column is kept
	#            when present and is optional, so unlabelled data can be cleaned
	#            with the same function.
	#
	# Returns:
	#   A data frame restricted to the columns above, with missing Age values
	#   replaced by the mean of the observed ages and rows with a missing
	#   Embarked value removed.
	#
	# Stops with an error if any predictor column is absent from rawData.
	cleanData <- function(rawData) {
		# Uncomment these two lines to visualize the missing data.
		# library(Amelia)
		# missmap(rawData, main="Missing vs. observed values")

		# Keep only the columns used by the model; everything else is dropped.
		predictorColumns <- c("Age", "Embarked", "Fare", "Parch", "SibSp", "Sex", "Pclass")
		missingColumns <- setdiff(predictorColumns, names(rawData))
		if (length(missingColumns) > 0) {
			stop("cleanData: missing required column(s): ",
			     paste(missingColumns, collapse=", "), call.=FALSE)
		}

		# Survived is the label: present in training data, absent in data to predict.
		columnsToKeep <- c(predictorColumns, intersect("Survived", names(rawData)))
		data <- rawData[columnsToKeep]

		# Replace missing age with the mean of the observed ages.
		data$Age[is.na(data$Age)] <- mean(data$Age, na.rm=TRUE)

		# Remove passengers with unknown Embarked location.
		data <- data[!is.na(data$Embarked),]

		data
	}

	# Read the raw CSV file and clean up invalid data.
	# Empty strings in the file are treated as missing values.
	rawData <- read.csv("train.csv", header=TRUE, na.strings=c(""))
	cleanedData <- cleanData(rawData)

	# Split the dataset into train and test subsets (80%:20%), in file order.
	# A logical mask is used instead of 1:n ranges because 1:0 counts backwards
	# and would select the wrong rows for an empty or very small dataset.
	splitPoint <- as.integer(nrow(cleanedData) * 0.8)
	isTrainingRow <- seq_len(nrow(cleanedData)) <= splitPoint
	trainingData <- cleanedData[isTrainingRow,]
	testData <- cleanedData[!isTrainingRow,]

	# Train the logistic regression classifier on all remaining columns.
	model <- glm(Survived ~ ., family=binomial(link="logit"), data=trainingData)

	# Output the summary analysis of the model.
	# Printed explicitly so the output also appears when the file is source()d.
	print(summary(model))
	print(anova(model, test="Chisq"))

	# Predict test data (probabilities, because type="response").
	rawTestPrediction <- predict(model, newdata=testData, type="response")

	# Convert the sigmoid values to hard 0 or 1 values.
	testPrediction <- ifelse(rawTestPrediction > 0.5, 1, 0)

	# Compute the classification error and report the accuracy.
	classificationError <- mean(as.vector(testPrediction) != testData$Survived)
	print(sprintf('Accuracy: %.3f%%', (1 - classificationError) * 100))

	# Plot the ROC curve (true positive rate against false positive rate).
	library(ROCR)
	modelROCCurve <- performance(prediction(rawTestPrediction, testData$Survived), measure="tpr", x.measure="fpr")
	plot(modelROCCurve)

	# Predict unknown data.
	unknownData <- read.csv("predict.csv", header=TRUE, na.strings=c(""))

	# The model was fitted on ages imputed with the mean age of train.csv, so the
	# same value is used here; otherwise a missing Age yields an NA prediction.
	# The imputation is done on a copy so unknownData keeps its original values.
	predictionInput <- unknownData
	predictionInput$Age[is.na(predictionInput$Age)] <- mean(rawData$Age, na.rm=TRUE)
	unknownPrediction <- predict(model, newdata=predictionInput, type="response")
	unknownPrediction <- ifelse(unknownPrediction > 0.5, 1, 0)

	# Append predicted value to original dataset.
	unknownData$Survived <- unknownPrediction