-
-
Save mick001/ac92e7c017aecff216fd to your computer and use it in GitHub Desktop.
# Logistic regression walkthrough: load 'train.csv', inspect and clean the
# data, fit a binomial GLM for Survived, then measure predictive ability on a
# held-out tail of the rows (accuracy, confusion matrix, ROC curve and AUC).

library(Amelia) # missmap()
library(pscl)   # pR2()
library(caret)  # confusionMatrix()
library(ROCR)   # prediction(), performance()

# Load the raw training data and read empty strings as NA
training.data.raw <- read.csv("train.csv", header = TRUE, na.strings = "")

# Output the number of missing values for each column
vapply(training.data.raw, function(x) sum(is.na(x)), integer(1))

# Quick check for how many different values each feature has
vapply(training.data.raw, function(x) length(unique(x)), integer(1))

# A visual way to check for missing data
missmap(training.data.raw, main = "Missing values vs observed")

# Subsetting the data: keep only the columns used by the model, selected by
# position. NOTE(review): the positions assume the column layout of the
# original 'train.csv'; confirm them if the file layout differs.
data <- subset(training.data.raw, select = c(2, 3, 5, 6, 7, 8, 10, 12))

# Substitute the missing ages with the average age.
# NOTE(review): the mean is computed before the train/test split below, so the
# imputed values also use information from the test rows.
data$Age[is.na(data$Age)] <- mean(data$Age, na.rm = TRUE)

# A factor is R's way of dealing with categorical variables. Since R 4.0
# read.csv() no longer converts strings to factors by default, so encode the
# categorical columns explicitly; contrasts() below requires factors.
data$Sex <- as.factor(data$Sex)
data$Embarked <- as.factor(data$Embarked)
is.factor(data$Sex)      # Returns TRUE
is.factor(data$Embarked) # Returns TRUE

# Check categorical variables encoding for better understanding of the fitted
# model
contrasts(data$Sex)
contrasts(data$Embarked)

# Remove rows (Embarked) with NAs and renumber the remaining rows
data <- data[!is.na(data$Embarked), ]
rownames(data) <- NULL

# Train test splitting: the first n_train rows train the model, every
# remaining row is held out for testing.
n_train <- 800
stopifnot(nrow(data) > n_train)
train <- data[seq_len(n_train), ]
test <- data[(n_train + 1):nrow(data), ]

# Model fitting
model <- glm(Survived ~ ., family = binomial(link = "logit"), data = train)
summary(model)

# Analysis of deviance
anova(model, test = "Chisq")

# McFadden R^2
pR2(model)

#-------------------------------------------------------------------------------
# MEASURING THE PREDICTIVE ABILITY OF THE MODEL

# Predicted probabilities for the test rows, with the response column removed
# from the predictors handed to predict().
test_features <- test[, names(test) != "Survived", drop = FALSE]
p <- predict(model, newdata = test_features, type = "response")

# If prob > 0.5 then 1, else 0. Threshold can be set for better results
fitted.results <- ifelse(p > 0.5, 1, 0)
misClasificError <- mean(fitted.results != test$Survived)
print(paste("Accuracy", 1 - misClasificError))

# Confusion matrix. caret requires `data` and `reference` to be factors with
# the same levels; the explicit levels keep both classes present even if one
# is never predicted. The call is namespace-qualified so that a
# confusionMatrix() from another attached package cannot mask it.
caret::confusionMatrix(
  data = factor(fitted.results, levels = c(0, 1)),
  reference = factor(test$Survived, levels = c(0, 1))
)

# ROC and AUC
pr <- prediction(p, test$Survived)

# TPR = sensitivity, FPR = 1 - specificity
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

# performance() returns an S4 object; the AUC value is stored in its y.values
# slot.
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
@ harigovind-s-menon
He removed the first column (the passenger ID) because it is not needed for the analysis.
#Please try the below codes for contrasts:
contrasts(as.factor(data$Sex))
contrasts(as.factor(data$Embarked))
Hi,
Is this code for logistic regression as a statistical model or as a machine learning method? I need it as a statistical model: I did my work with machine learning, and I would like to model my dataset with ordinary logistic regression to compare it with three machine learning methods.
Can you help me please?
Best Regards
Arezoo
Hello Arezoo,
The code is for logistic regression model and can be used as one of the simplest machine learning tools. There is a bit of information on this link you may find helpful: https://www.r-bloggers.com/how-to-perform-a-logistic-regression-in-r/ . The good thing is with this method you can get the accuracy to compare with other methods, vs. simple linear regression. You may also want to consider these models for comparative work: Decision Tree, KNN, SVM, ANN, and Naïve Bayes. Good luck to you!
Shahryar
thanks
sir,
i got the same accuracy as you did but
when i write code of confusion matrix it shows the following error in r
confusionMatrix(data=fitted.results, reference=test$Survived)
Error in confusionMatrix(data = fitted.results, reference = test$Survived) :
unused arguments (data = fitted.results, reference = test$Survived)
kindly reply
Hi, i managed to get the error corrected by putting it in a "table": see below
confusionMatrix(table(data=fitted.results, reference=test$Survived))
sir,
i got the same accuracy as you did but
when i write code of confusion matrix it shows the following error in R:
confusionMatrix(data=fitted.results, reference=test$Survived)
Error in confusionMatrix(data = fitted.results, reference = test$Survived) :
unused arguments (data = fitted.results, reference = test$Survived)
kindly reply
Hi, i managed to get the error corrected by putting it in a "table": see below
confusionMatrix(table(data=fitted.results, reference=test$Survived))
yes, it is working thanks :)
Please help: when I run confusionMatrix using table(), the error message I get is "all arguments must have the same length". Please advise what the problem is.
Fix for the error "`data` and `reference` should be factors with the same levels." in
confusionMatrix(data=fitted.results, reference=test$Survived)
do:
install.packages("e1071")
confusionMatrix(data=as.factor(fitted.results), reference=as.factor(test$Survived))
Thanks for the code and tutorial!