Last active
December 12, 2015 03:38
-
-
Save raleighlinda/4708052 to your computer and use it in GitHub Desktop.
R logistic regression prediction for Titanic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#File created 1/31/13
#contains R code to
#-read in the Kaggle Titanic Competition csv data files
#-create a simple logistic regression model
#-make predictions on training and test data
#-write out test predictions to a csv file
#
#Replace <your path here> with the full path to your copy of the train and test csv files.
###################################################################################
#create a Kaggle account http://www.kaggle.com/account/register
#read and agree to the rules if you choose to continue
#enter the Kaggle Titanic Competition http://www.kaggle.com/c/titanic-gettingStarted
#download train.csv and test.csv
#download R from http://www.r-project.org/
#you will have to choose a 'mirror' or site - usually a university or research site
#read the training data into a dataframe called train
train <- read.csv("C:/Users/<your path here>/train.csv", header = TRUE)
#fail early with a clear message if the columns used by this script are absent
#NOTE(review): column names are case sensitive - confirm they match your download
missing_train <- setdiff(c("survived", "pclass", "sex"), names(train))
if (length(missing_train) > 0) {
  stop("train.csv is missing column(s): ",
       paste(missing_train, collapse = ", "), call. = FALSE)
}
#set pclass, the passenger class, to be ordered categorical (3 < 2 < 1)
train$pclass <- factor(train$pclass, levels = c(3, 2, 1), ordered = TRUE)
#create a truth vector of survival results from training (TRUE = survived)
S <- train$survived == 1
#read the test data into a dataframe named test
test <- read.csv("C:/Users/<your path here>/test.csv", header = TRUE)
missing_test <- setdiff(c("pclass", "sex"), names(test))
if (length(missing_test) > 0) {
  stop("test.csv is missing column(s): ",
       paste(missing_test, collapse = ", "), call. = FALSE)
}
#pclass must be encoded for the test data exactly as for the training data
test$pclass <- factor(test$pclass, levels = c(3, 2, 1), ordered = TRUE)
#create a super simple logistic regression model with the training data
#predicting survival based on passenger class and sex
logistic.model <- glm(survived ~ pclass + sex,
                      family = binomial(), data = train)
#generate fitted survival probabilities for the training data
#using the predict method of the logistic model
training_predictions <- predict(logistic.model, type = "response")
#training error: share of training rows whose {0,1} prediction
#(outcome cutoff at 0.5) disagrees with the truth vector S
training_error <- mean((training_predictions >= 0.5) != S)
#print explicitly so the values are shown even when run via source()
print(training_error)
#training accuracy
print(1 - training_error)
#predicted survival probabilities for the test data
test_probabilities <- predict(logistic.model, newdata = test, type = "response")
#using a probability cutoff of 0.5 for outcome of survived (1),
#everything else, including a missing prediction, defaults to deceased (0)
test_predictions <- as.integer(
  !is.na(test_probabilities) & test_probabilities >= 0.5
)
#write out the test_predictions to a file, one 0/1 value per line,
#with no header, row names or quoting
write.table(test_predictions, "C:/Users/<your path here>/predictions.csv",
            col.names = FALSE, row.names = FALSE, quote = FALSE)
#submit your predictions.csv file to Kaggle.com to view the resulting test data score
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Super Simple Logistic Regression for Kaggle.com Titanic Competition